From 81cec266328755b17c0aea7c754f61884c0416fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Thu, 10 Jul 2025 03:24:13 -0400 Subject: [PATCH 01/39] Convert corpus structure to dict --- lib/corpus.py | 86 +++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 6e0b6c37e..2b8f666cc 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -45,7 +45,7 @@ class CorpusParser(sax.handler.ContentHandler): """ This classes methods are called by the sax-parser whenever it encounters an event in the xml-file (tags/characters/namespaces/...). It uses a stack of elements to remember the part of the corpus that - is currently beeing read. + is currently being read. """ def __init__(self, corpus: Corpus, path: str, *, reformat_orth: bool = True): @@ -77,7 +77,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): subcorpus = Corpus() subcorpus.name = attrs["name"] subcorpus.parent_corpus = e - e.subcorpora.append(subcorpus) + e.subcorpora[subcorpus.name] = subcorpus self.elements.append(subcorpus) elif name == "include": assert isinstance(e, Corpus), " may only occur within a or element" @@ -88,12 +88,12 @@ def startElement(self, name: str, attrs: Dict[str, str]): print( "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name) ) - for sc in c.subcorpora: + for sc in c.subcorpora.values(): sc.parent_corpus = e.parent_corpus - for r in c.recordings: + for r in c.recordings.values(): r.corpus = e - e.subcorpora.extend(c.subcorpora) - e.recordings.extend(c.recordings) + e.subcorpora.update(c.subcorpora) + e.recordings.update(c.recordings) e.speakers.update(c.speakers) elif name == "recording": assert isinstance(e, Corpus), " may only occur within a or element" @@ -173,85 +173,75 @@ def __init__(self): self.parent_corpus: Optional[Corpus] = None - self.subcorpora: List[Corpus] = [] - self.recordings: 
List[Recording] = [] + self.subcorpora: Dict[str, Corpus] = {} # full-name: Corpus + self.recordings: Dict[str, Recording] = {} # full-name: Recording def segments(self) -> Iterable[Segment]: """ :return: an iterator over all segments within the corpus """ - for r in self.recordings: + for r in self.recordings.values(): yield from r.segments - for sc in self.subcorpora: + for sc in self.subcorpora.values(): yield from sc.segments() def get_recording_by_name(self, name: str) -> Recording: """ - :return: the recording specified by its name + :return: the recording specified by its full name """ - for rec in self.all_recordings(): - if rec.fullname() == name: - return rec - assert False, f"Recording '{name}' was not found in corpus" + assert name in self.recordings, f"Recording '{name}' was not found in corpus" + + return self.recordings[name] def get_segment_by_name(self, name: str) -> Segment: """ - :return: the segment specified by its name + :return: the segment specified by its full name """ - for seg in self.segments(): - if seg.fullname() == name: - return seg - assert False, f"Segment '{name}' was not found in corpus" + recording_name = "/".join(name.split("/")[:-1]) + assert recording_name in self.recordings, ( + f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus" + ) + + return self.recordings[recording_name].get_segment_by_name(name) def all_recordings(self) -> Iterable[Recording]: - yield from self.recordings - for sc in self.subcorpora: + yield from self.recordings.values() + for sc in self.subcorpora.values(): yield from sc.all_recordings() def all_speakers(self) -> Iterable[Speaker]: yield from self.speakers.values() - for sc in self.subcorpora: + for sc in self.subcorpora.values(): yield from sc.all_speakers() def top_level_recordings(self) -> Iterable[Recording]: - yield from self.recordings + yield from self.recordings.values() def top_level_subcorpora(self) -> Iterable[Corpus]: - yield from self.subcorpora + 
yield from self.subcorpora.values() def top_level_speakers(self) -> Iterable[Speaker]: yield from self.speakers.values() def remove_recording(self, recording: Recording): - to_delete = [] - for idx, r in enumerate(self.recordings): - if r is recording or r == recording or r.name == recording: - to_delete.append(idx) - for idx in reversed(to_delete): - del self.recordings[idx] - for sc in self.subcorpora: + if recording.name in self.recordings: + del self.recordings[recording.fullname()] + for sc in self.subcorpora.values(): sc.remove_recording(recording) def remove_recordings(self, recordings: List[Recording]): - recording_fullnames = {recording.fullname() for recording in recordings} - to_delete = [] - for idx, r in enumerate(self.recordings): - if r.fullname() in recording_fullnames: - to_delete.append(idx) - for idx in reversed(to_delete): - del self.recordings[idx] - for sc in self.subcorpora: - sc.remove_recordings(recordings) + for r in recordings: + self.remove_recording(r) def add_recording(self, recording: Recording): assert isinstance(recording, Recording) recording.corpus = self - self.recordings.append(recording) + self.recordings[recording.fullname()] = recording def add_subcorpus(self, corpus: Corpus): assert isinstance(corpus, Corpus) corpus.parent_corpus = self - self.subcorpora.append(corpus) + self.subcorpora[corpus.fullname()] = corpus def add_speaker(self, speaker: Speaker): assert isinstance(speaker, Speaker) @@ -281,9 +271,9 @@ def filter_segments(self, filter_function: FilterFunction): filter all segments (including in subcorpora) using filter_function :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept """ - for r in self.recordings: - r.segments = [s for s in r.segments if filter_function(self, r, s)] - for sc in self.subcorpora: + for r in self.recordings.values(): + r.segments = {s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)} + for sc in 
self.subcorpora.values(): sc.filter_segments(filter_function) def load(self, path: str, *, reformat_orth: bool = True): @@ -322,10 +312,10 @@ def _dump_internal(self, out: TextIO, indentation: str = ""): if self.speaker_name is not None: out.write('%s \n' % (indentation, self.speaker_name)) - for r in self.recordings: + for r in self.recordings.values(): r.dump(out, indentation + " ") - for sc in self.subcorpora: + for sc in self.subcorpora.values(): sc._dump_internal(out, indentation + " ") if self.parent_corpus is None: From 3ba9608dd3f3cd09c84cd449b3b11861a377465c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Thu, 10 Jul 2025 03:24:27 -0400 Subject: [PATCH 02/39] Convert recording structure to dict --- lib/corpus.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 2b8f666cc..226e2f0fd 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -344,7 +344,7 @@ def __init__(self): super().__init__() self.audio: Optional[str] = None self.corpus: Optional[Corpus] = None - self.segments: List[Segment] = [] + self.segments: Dict[str, Segment] = {} def fullname(self) -> str: return self.corpus.fullname() + "/" + self.name @@ -365,21 +365,26 @@ def dump(self, out: TextIO, indentation: str = ""): if self.speaker_name is not None: out.write('%s \n' % (indentation, self.speaker_name)) - for s in self.segments: + for s in self.segments.values(): s.dump(out, indentation + " ") out.write("%s\n" % indentation) + def get_segment_by_name(self, name: str): + assert name in self.segments, f"Segment '{name}' was not found in recording '{self.name}'" + + return self.segments[name] + def add_segment(self, segment: Segment): assert isinstance(segment, Segment) segment.recording = self - self.segments.append(segment) + self.segments[segment.fullname()] = segment def get_segment_mapping(self) -> Dict[str, Segment]: """ :return: Mapping from segment fullnames to actual segments. 
""" - return {seg.fullname(): seg for seg in self.segments} + return {seg.fullname(): seg for seg in self.segments.values()} def __repr__(self): return f"<{self.__class__.__name__} {self.fullname()}>" From b066017304744a2d026654035904a62faf9e0f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Thu, 10 Jul 2025 03:53:30 -0400 Subject: [PATCH 03/39] Fix `Corpus.segments()` call Now return the actual segments instead of the segment full names, following the previous commit --- lib/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/corpus.py b/lib/corpus.py index 226e2f0fd..8833f2070 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -181,7 +181,7 @@ def segments(self) -> Iterable[Segment]: :return: an iterator over all segments within the corpus """ for r in self.recordings.values(): - yield from r.segments + yield from r.segments.values() for sc in self.subcorpora.values(): yield from sc.segments() From 16c7271ec84fffb42ea2ca8419aa86d5924f897c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:17:18 +0200 Subject: [PATCH 04/39] Use `rsplit` instead of splitting and concatenating back Co-authored-by: Albert Zeyer --- lib/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/corpus.py b/lib/corpus.py index 8833f2070..da45669a4 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -197,7 +197,7 @@ def get_segment_by_name(self, name: str) -> Segment: """ :return: the segment specified by its full name """ - recording_name = "/".join(name.split("/")[:-1]) + recording_name, _ = name.rsplit("/", 1) assert recording_name in self.recordings, ( f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus" ) From 28ce8893b04fd1eea91151a55753c65e35965af0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 14 Jul 2025 
06:26:34 -0400 Subject: [PATCH 05/39] Add recording at the beginning --- tests/job_tests/corpus/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py index 9b71e54c6..f074a7afa 100644 --- a/tests/job_tests/corpus/test_transform.py +++ b/tests/job_tests/corpus/test_transform.py @@ -30,13 +30,13 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp corpus.name = corpus_dict["name"] for recording_dict in corpus_dict.get("recordings", []): recording = libcorpus.Recording() + corpus.add_recording(recording) recording.name = recording_dict["name"] for segment_name in recording_dict.get("segments", []): segment = libcorpus.Segment() segment.name = segment_name segment.orth = "" recording.add_segment(segment) - corpus.add_recording(recording) for subcorpus_dict in corpus_dict.get("subcorpora", []): corpus.add_subcorpus(_CorpusCreatorHelper._create_corpus_with_structure(subcorpus_dict)) From 14247a5143888ca407d98d665cf7990e0428b53b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 14 Jul 2025 06:37:12 -0400 Subject: [PATCH 06/39] Fix --- tests/job_tests/corpus/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py index f074a7afa..db9f517e5 100644 --- a/tests/job_tests/corpus/test_transform.py +++ b/tests/job_tests/corpus/test_transform.py @@ -30,8 +30,8 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp corpus.name = corpus_dict["name"] for recording_dict in corpus_dict.get("recordings", []): recording = libcorpus.Recording() - corpus.add_recording(recording) recording.name = recording_dict["name"] + corpus.add_recording(recording) for segment_name in recording_dict.get("segments", []): segment = libcorpus.Segment() segment.name = segment_name 
From 4cd832ac43ab84b9b1975ea8967cceb88d54303f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 14 Jul 2025 06:39:46 -0400 Subject: [PATCH 07/39] Add name to NamedEntity/Corpus/Recording/Segment init --- lib/corpus.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index da45669a4..c6b9feec2 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -21,9 +21,9 @@ class NamedEntity: - def __init__(self): + def __init__(self, name: Optional[str] = None): super().__init__() - self.name: Optional[str] = None + self.name = name def __repr__(self): if self.name is None: @@ -168,8 +168,11 @@ class Corpus(NamedEntity, CorpusSection): attribute is set. Corpora with include statements can be read but are written back as a single file. """ - def __init__(self): - super().__init__() + def __init__(self, name: Optional[str] = None): + """ + :param name: Corpus name. + """ + super().__init__(name=name) self.parent_corpus: Optional[Corpus] = None @@ -340,8 +343,12 @@ def __repr__(self): class Recording(NamedEntity, CorpusSection): - def __init__(self): - super().__init__() + def __init__(self, name: Optional[str] = None): + """ + :param name: Recording name. + """ + super().__init__(name=name) + self.audio: Optional[str] = None self.corpus: Optional[Corpus] = None self.segments: Dict[str, Segment] = {} @@ -394,6 +401,7 @@ class Segment(NamedEntity): def __init__( self, *, + name: Optional[str] = None, start: float = 0.0, end: float = 0.0, track: Optional[int] = None, @@ -413,7 +421,7 @@ def __init__( :param speaker_name: Speaker name. :param recording: Recording in which the segment is embedded. 
""" - super().__init__() + super().__init__(name=name) self.start = start self.end = end From dc8b4e4288de7bdee34d325f1b6ac87ae9538d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 14 Jul 2025 06:41:49 -0400 Subject: [PATCH 08/39] Use newly declared parameters in init --- tests/job_tests/corpus/test_transform.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py index db9f517e5..036d0b442 100644 --- a/tests/job_tests/corpus/test_transform.py +++ b/tests/job_tests/corpus/test_transform.py @@ -26,16 +26,12 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp :param corpus_dict: Definition of a corpus in dictionary form. :return: Corpus object defined by the corpus dictionary provided. """ - corpus = libcorpus.Corpus() - corpus.name = corpus_dict["name"] + corpus = libcorpus.Corpus(name=corpus_dict["name"]) for recording_dict in corpus_dict.get("recordings", []): - recording = libcorpus.Recording() - recording.name = recording_dict["name"] + recording = libcorpus.Recording(name=recording_dict["name"]) corpus.add_recording(recording) for segment_name in recording_dict.get("segments", []): - segment = libcorpus.Segment() - segment.name = segment_name - segment.orth = "" + segment = libcorpus.Segment(name=segment_name, orth="") recording.add_segment(segment) for subcorpus_dict in corpus_dict.get("subcorpora", []): corpus.add_subcorpus(_CorpusCreatorHelper._create_corpus_with_structure(subcorpus_dict)) From 9d2387221ac93d7537cf535044ae9651dfa104d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Tue, 15 Jul 2025 11:07:26 +0200 Subject: [PATCH 09/39] Directly copy self segments Co-authored-by: DanEnergetics --- lib/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/corpus.py 
b/lib/corpus.py index c6b9feec2..08d7ae1fc 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -391,7 +391,7 @@ def get_segment_mapping(self) -> Dict[str, Segment]: """ :return: Mapping from segment fullnames to actual segments. """ - return {seg.fullname(): seg for seg in self.segments.values()} + return self.segments.copy() def __repr__(self): return f"<{self.__class__.__name__} {self.fullname()}>" From a673f833c17edd87b895a75d9341b09a1a84eb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:46:36 -0400 Subject: [PATCH 10/39] Better init --- lib/corpus.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 08d7ae1fc..0591d7066 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -74,10 +74,8 @@ def startElement(self, name: str, attrs: Dict[str, str]): e.name = attrs["name"] elif name == "subcorpus": assert isinstance(e, Corpus), " may only occur within a or element" - subcorpus = Corpus() - subcorpus.name = attrs["name"] - subcorpus.parent_corpus = e - e.subcorpora[subcorpus.name] = subcorpus + subcorpus = Corpus(name=attrs["name"]) + e.add_subcorpus(subcorpus) self.elements.append(subcorpus) elif name == "include": assert isinstance(e, Corpus), " may only occur within a or element" @@ -97,9 +95,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): e.speakers.update(c.speakers) elif name == "recording": assert isinstance(e, Corpus), " may only occur within a or element" - rec = Recording() - rec.name = attrs["name"] - rec.audio = attrs["audio"] + rec = Recording(name=attrs["name"], audio=attrs["audio"]) e.add_recording(rec) self.elements.append(rec) elif name == "segment": From 1ebeb7db00a7d05928e3dfd0be4f1b85f67576cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:46:59 -0400 Subject: [PATCH 11/39] Corpus: add subcorpora, recordings properties as read only --- lib/corpus.py | 52 
+++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 0591d7066..0f5ad86ef 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -172,59 +172,71 @@ def __init__(self, name: Optional[str] = None): self.parent_corpus: Optional[Corpus] = None - self.subcorpora: Dict[str, Corpus] = {} # full-name: Corpus - self.recordings: Dict[str, Recording] = {} # full-name: Recording + self._subcorpora: Dict[str, Corpus] = {} # full-name: Corpus + self._recordings: Dict[str, Recording] = {} # full-name: Recording + + @property + def subcorpora(self): + """ + Read-only property. If one wants to add a subcorpus to this corpus, please use :func:`Corpus.add_subcorpus`. + """ + return self._subcorpora.values() + + @property + def recordings(self): + """ + Read-only property. If one wants to add a recording to this corpus, please use :func:`Corpus.add_recording`. + """ + return self._recordings.values() def segments(self) -> Iterable[Segment]: """ :return: an iterator over all segments within the corpus """ - for r in self.recordings.values(): - yield from r.segments.values() - for sc in self.subcorpora.values(): + for r in self.recordings: + yield from r.segments + for sc in self.subcorpora: yield from sc.segments() def get_recording_by_name(self, name: str) -> Recording: """ :return: the recording specified by its full name """ - assert name in self.recordings, f"Recording '{name}' was not found in corpus" - - return self.recordings[name] + assert name in self._recordings, f"Recording '{name}' was not found in corpus" + return self._recordings[name] def get_segment_by_name(self, name: str) -> Segment: """ :return: the segment specified by its full name """ recording_name, _ = name.rsplit("/", 1) - assert recording_name in self.recordings, ( + assert recording_name in self._recordings, ( f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus" ) - - return 
self.recordings[recording_name].get_segment_by_name(name) + return self._recordings[recording_name].get_segment_by_name(name) def all_recordings(self) -> Iterable[Recording]: - yield from self.recordings.values() - for sc in self.subcorpora.values(): + yield from self.recordings + for sc in self.subcorpora: yield from sc.all_recordings() def all_speakers(self) -> Iterable[Speaker]: yield from self.speakers.values() - for sc in self.subcorpora.values(): + for sc in self.subcorpora: yield from sc.all_speakers() def top_level_recordings(self) -> Iterable[Recording]: - yield from self.recordings.values() + yield from self.recordings def top_level_subcorpora(self) -> Iterable[Corpus]: - yield from self.subcorpora.values() + yield from self.subcorpora def top_level_speakers(self) -> Iterable[Speaker]: yield from self.speakers.values() def remove_recording(self, recording: Recording): - if recording.name in self.recordings: - del self.recordings[recording.fullname()] + if recording.name in self._recordings: + del self._recordings[recording.fullname()] for sc in self.subcorpora.values(): sc.remove_recording(recording) @@ -235,12 +247,12 @@ def remove_recordings(self, recordings: List[Recording]): def add_recording(self, recording: Recording): assert isinstance(recording, Recording) recording.corpus = self - self.recordings[recording.fullname()] = recording + self._recordings[recording.fullname()] = recording def add_subcorpus(self, corpus: Corpus): assert isinstance(corpus, Corpus) corpus.parent_corpus = self - self.subcorpora[corpus.fullname()] = corpus + self._subcorpora[corpus.fullname()] = corpus def add_speaker(self, speaker: Speaker): assert isinstance(speaker, Speaker) From 1e678e83ece4348745bdc5ce1706effd85840c33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:48:23 -0400 Subject: [PATCH 12/39] Update filter segments function --- lib/corpus.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 0f5ad86ef..587ad7677 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -282,10 +282,12 @@ def filter_segments(self, filter_function: FilterFunction): filter all segments (including in subcorpora) using filter_function :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept """ - for r in self.recordings.values(): - r.segments = {s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)} - for sc in self.subcorpora.values(): - sc.filter_segments(filter_function) + for rec_full_name, r in self._recordings.items(): + self._recordings[rec_full_name]._segments = { + s.fullname(): s for s in r.segments.values() if filter_function(self, r, s) + } + for subcorpus_full_name in self._subcorpora(): + self._subcorpora[subcorpus_full_name].filter_segments(filter_function) def load(self, path: str, *, reformat_orth: bool = True): """ From de59bece355f9d51374fef442d87e2e1d04b6658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:48:47 -0400 Subject: [PATCH 13/39] Corpus: add subcorpora, recordings as properties (2) --- lib/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 587ad7677..e0b18b647 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -325,10 +325,10 @@ def _dump_internal(self, out: TextIO, indentation: str = ""): if self.speaker_name is not None: out.write('%s \n' % (indentation, self.speaker_name)) - for r in self.recordings.values(): + for r in self.recordings: r.dump(out, indentation + " ") - for sc in self.subcorpora.values(): + for sc in self.subcorpora: sc._dump_internal(out, indentation + " ") if self.parent_corpus is None: From b6003a3a768cff165ae699e2d77a060388d2b6d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:49:01 -0400 Subject: [PATCH 14/39] 
Recording: add segments as property --- lib/corpus.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index e0b18b647..139fbcd68 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -353,17 +353,24 @@ def __repr__(self): class Recording(NamedEntity, CorpusSection): - def __init__(self, name: Optional[str] = None): + def __init__(self, name: Optional[str] = None, audio: Optional[str] = None): """ :param name: Recording name. """ super().__init__(name=name) - self.audio: Optional[str] = None + self.audio = audio self.corpus: Optional[Corpus] = None - self.segments: Dict[str, Segment] = {} + self._segments: Dict[str, Segment] = {} + + @property + def segments(self): + return self._segments.values() def fullname(self) -> str: + assert self.corpus is not None, ( + "Please add the recording to a corpus via Corpus.add_recording() before triggering fullname()." + ) return self.corpus.fullname() + "/" + self.name def speaker(self, speaker_name: Optional[str] = None) -> Speaker: @@ -382,26 +389,25 @@ def dump(self, out: TextIO, indentation: str = ""): if self.speaker_name is not None: out.write('%s \n' % (indentation, self.speaker_name)) - for s in self.segments.values(): + for s in self.segments: s.dump(out, indentation + " ") out.write("%s\n" % indentation) def get_segment_by_name(self, name: str): - assert name in self.segments, f"Segment '{name}' was not found in recording '{self.name}'" - - return self.segments[name] + assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'" + return self._segments[name] def add_segment(self, segment: Segment): assert isinstance(segment, Segment) segment.recording = self - self.segments[segment.fullname()] = segment + self._segments[segment.fullname()] = segment def get_segment_mapping(self) -> Dict[str, Segment]: """ :return: Mapping from segment fullnames to actual segments. 
""" - return self.segments.copy() + return self._segments.copy() def __repr__(self): return f"<{self.__class__.__name__} {self.fullname()}>" From a1ea859f23559cdebbcd97560bbd809d7f491b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:49:30 -0400 Subject: [PATCH 15/39] Segment: add assertion in fullname --- lib/corpus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/corpus.py b/lib/corpus.py index 139fbcd68..aeeda2567 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -456,6 +456,9 @@ def full_orth(self) -> str: return " ".join([s for s in [self.left_context_orth, self.orth, self.right_context_orth] if s]) def fullname(self) -> str: + assert self.recording is not None, ( + "Please add the recording to a corpus via Recording.add_segment() before triggering Segment.fullname()." + ) return self.recording.fullname() + "/" + self.name def speaker(self) -> Speaker: From 6d41c465026b99f1a5cadfe8e4a44bf2b29052a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 05:58:01 -0400 Subject: [PATCH 16/39] Always return iterables --- lib/corpus.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index aeeda2567..1dfd8fb97 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -176,14 +176,14 @@ def __init__(self, name: Optional[str] = None): self._recordings: Dict[str, Recording] = {} # full-name: Recording @property - def subcorpora(self): + def subcorpora(self) -> Iterable[Corpus]: """ Read-only property. If one wants to add a subcorpus to this corpus, please use :func:`Corpus.add_subcorpus`. """ return self._subcorpora.values() @property - def recordings(self): + def recordings(self) -> Iterable[Recording]: """ Read-only property. If one wants to add a recording to this corpus, please use :func:`Corpus.add_recording`. 
""" @@ -364,7 +364,10 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None): self._segments: Dict[str, Segment] = {} @property - def segments(self): + def segments(self) -> Iterable[Segment]: + """ + Read-only property. If one wants to add a segment to this recording, please use :func:`Recording.add_segment`. + """ return self._segments.values() def fullname(self) -> str: From d5fa7a942d280be03b8f04461ca94de5f457bf30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 06:26:08 -0400 Subject: [PATCH 17/39] Set explicit read only properties --- lib/corpus.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 1dfd8fb97..8101807b9 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -178,17 +178,35 @@ def __init__(self, name: Optional[str] = None): @property def subcorpora(self) -> Iterable[Corpus]: """ - Read-only property. If one wants to add a subcorpus to this corpus, please use :func:`Corpus.add_subcorpus`. + Read-only property. + + :return: Iterable of all top-level subcorpora. """ return self._subcorpora.values() + @subcorpora.setter + def subcorpora(self): + raise AttributeError( + "Corpus.subcorpora is a read-only attribute. " + "If you're using old code, please use the current proper API function." + ) + @property def recordings(self) -> Iterable[Recording]: """ - Read-only property. If one wants to add a recording to this corpus, please use :func:`Corpus.add_recording`. + Read-only property. + + :return: Iterable of all top-level recordings. """ return self._recordings.values() + @recordings.setter + def recordings(self): + raise AttributeError( + "Corpus.recordings is a read-only attribute. " + "If you're using old code, please use the current proper API function." 
+ ) + def segments(self) -> Iterable[Segment]: """ :return: an iterator over all segments within the corpus @@ -353,14 +371,15 @@ def __repr__(self): class Recording(NamedEntity, CorpusSection): - def __init__(self, name: Optional[str] = None, audio: Optional[str] = None): + def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corpus: Optional[Corpus] = None): """ :param name: Recording name. """ super().__init__(name=name) self.audio = audio - self.corpus: Optional[Corpus] = None + if corpus: + corpus.add_recording(self) self._segments: Dict[str, Segment] = {} @property @@ -370,6 +389,13 @@ def segments(self) -> Iterable[Segment]: """ return self._segments.values() + @segments.setter + def segments(self): + raise AttributeError( + "Recording.segments is a read-only property. " + "If you're using old code, please use the current proper API function." + ) + def fullname(self) -> str: assert self.corpus is not None, ( "Please add the recording to a corpus via Corpus.add_recording() before triggering fullname()." @@ -402,6 +428,9 @@ def get_segment_by_name(self, name: str): return self._segments[name] def add_segment(self, segment: Segment): + assert self.corpus is not None, ( + "The recording must be added to a corpus via Corpus.add_recording() before using Recording.add_segment()." 
+ ) assert isinstance(segment, Segment) segment.recording = self self._segments[segment.fullname()] = segment From 151966c8abdead4654040fd9aa957513e9e11879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 06:32:35 -0400 Subject: [PATCH 18/39] Improve docstring --- lib/corpus.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/corpus.py b/lib/corpus.py index 8101807b9..f305361fe 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -423,7 +423,12 @@ def dump(self, out: TextIO, indentation: str = ""): out.write("%s\n" % indentation) - def get_segment_by_name(self, name: str): + def get_segment_by_name(self, name: str) -> Segment: + """ + :param name: Full name of the segment. + + :return: Segment which is identified by the full name specified in :param:`name`. + """ assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'" return self._segments[name] From b406ac795472d79e45b2c68c599db0763b1427a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 06:32:43 -0400 Subject: [PATCH 19/39] Add remove_segment call --- lib/corpus.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/corpus.py b/lib/corpus.py index f305361fe..df5ee0178 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -440,6 +440,12 @@ def add_segment(self, segment: Segment): segment.recording = self self._segments[segment.fullname()] = segment + def remove_segment(self, segment: Segment): + assert segment.fullname() in self._segments, ( + f"Segment '{segment.fullname()}' was not found in recording '{self.name}'" + ) + del self._segments[segment.fullname()] + def get_segment_mapping(self) -> Dict[str, Segment]: """ :return: Mapping from segment fullnames to actual segments. 
From 4d356dfe9ac105458f15d390cb337f69ef51b7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 06:48:18 -0400 Subject: [PATCH 20/39] Fix recording segments call --- lib/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/corpus.py b/lib/corpus.py index df5ee0178..cd658ce62 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -302,7 +302,7 @@ def filter_segments(self, filter_function: FilterFunction): """ for rec_full_name, r in self._recordings.items(): self._recordings[rec_full_name]._segments = { - s.fullname(): s for s in r.segments.values() if filter_function(self, r, s) + s.fullname(): s for s in r.segments if filter_function(self, r, s) } for subcorpus_full_name in self._subcorpora(): self._subcorpora[subcorpus_full_name].filter_segments(filter_function) From 42e69d204cc30348308dc98fab067d2e5bc07c6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 06:48:33 -0400 Subject: [PATCH 21/39] Fix Recording.segments calls throughout the repo --- audio/ffmpeg.py | 5 +++-- corpus/data_augmentation.py | 11 +++++------ corpus/filter.py | 29 ++++++++++++++++------------- corpus/transform.py | 19 +++++++++---------- datasets/librispeech.py | 7 ++----- datasets/switchboard.py | 7 ++----- lib/corpus.py | 4 ++-- 7 files changed, 39 insertions(+), 43 deletions(-) diff --git a/audio/ffmpeg.py b/audio/ffmpeg.py index 9809b8eb2..b5ce0331a 100644 --- a/audio/ffmpeg.py +++ b/audio/ffmpeg.py @@ -159,8 +159,9 @@ def run_recover_duration(self): c.load("temp_corpus.xml.gz") for r in c.all_recordings(): - assert len(r.segments) == 1, "needs to be a single segment recording" - segment = r.segments[0] + recording_segments = list(r.segments) + assert len(recording_segments) == 1, "needs to be a single segment recording" + segment = recording_segments[0] old_duration = segment.end assert r.audio is not None data, sample_rate = soundfile.read(open(r.audio, 
"rb")) diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py index 0be616aad..40d9ced09 100644 --- a/corpus/data_augmentation.py +++ b/corpus/data_augmentation.py @@ -134,15 +134,14 @@ def run(self): command = ffmpeg_head + noise_inputs + filter_head + volume_reduction + mixer + filter_tail self.sh(command) - nr = corpus.Recording() + nr = corpus.Recording(corpus=nc) nr.name = r.name - nr.segments = r.segments nr.speaker_name = r.speaker_name nr.default_speaker = r.default_speaker nr.speakers = r.speakers nr.audio = str(self.out_audio_folder) + "/" + reverbed_audio_name - nc.add_recording(nr) - for s in nr.segments: + for s in r.segments: + nr.add_segment(s) segment_file_names.append(nc.name + "/" + nr.name + "/" + s.name + "\n") nc.dump(self.out_corpus.get_path()) @@ -205,13 +204,13 @@ def run(self): pr = corpus.Recording() pr.name = r.name - pr.segments = r.segments pr.speaker_name = r.speaker_name pr.speakers = r.speakers pr.default_speaker = r.default_speaker pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name nc.add_recording(pr) - for s in pr.segments: + for s in r.segments: + pr.add_segment(s) segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name) s.start /= self.speed_factor s.end /= self.speed_factor diff --git a/corpus/filter.py b/corpus/filter.py index fc374af17..f518dd1b9 100644 --- a/corpus/filter.py +++ b/corpus/filter.py @@ -32,14 +32,11 @@ def _delete_empty_recordings(corpus: corpus.Corpus, removed_recordings_file: str :param c: Corpus for which to delete the empty recordings. :param removed_recordings_file: File in which to dump all recordings that have been deleted. 
""" - to_delete = [] - for rec in corpus.all_recordings(): - if not rec.segments: - to_delete.append(rec) - - corpus.remove_recordings(to_delete) with open(removed_recordings_file, "w") as f: - f.write("\n".join(rec.fullname() for rec in to_delete)) + for rec in corpus.all_recordings(): + if not rec.segments: + corpus.remove_recording(rec) + f.write(f"{rec.fullname()}\n") class FilterSegmentsByListJob(Job): @@ -440,10 +437,16 @@ def run(self): c.load(tk.uncached_path(self.bliss_corpus)) for rec in c.all_recordings(): - if self.invert_match: - rec.segments = [x for x in rec.segments if x.fullname() not in segments and x.name not in segments] - else: - rec.segments = [x for x in rec.segments if x.fullname() in segments or x.name in segments] + segments_to_delete = [] + for s in rec.segments: + if self.invert_match: + if s.fullname() in segments or s.name in segments: + segments_to_delete.append(s) + else: + if s.fullname() not in segments and s.name not in segments: + segments_to_delete.append(s) + for s in segments_to_delete: + rec.remove_segment(s) if self.delete_empty_recordings: # Remove the recordings without segments due to the filtering. 
@@ -527,7 +530,7 @@ def maybe_to_lower(s): c = corpus.Corpus() c.load(self.corpus.get_path()) - num_segments_per_recording = {r.fullname(): len(r.segments) for r in c.all_recordings()} + num_segments_per_recording = {r.fullname(): len(list(r.segments)) for r in c.all_recordings()} # use var name instead of attribute to avoid problem with name scope log_oov_list = self.log_oov_list @@ -577,7 +580,7 @@ def __call__(self, corpus: corpus.Corpus, recording: corpus.Recording, segment: recordings_to_be_removed = [] for r in c.all_recordings(): num_seg = num_segments_per_recording[r.fullname()] - new_num_seg = len(r.segments) + new_num_seg = len(list(r.segments)) if num_seg and (num_seg - new_num_seg) / num_seg > self.recording_oov_tolerance: recordings_to_be_removed.append(r) diff --git a/corpus/transform.py b/corpus/transform.py index 348ac98df..9605329d3 100644 --- a/corpus/transform.py +++ b/corpus/transform.py @@ -70,22 +70,22 @@ def run(self): words = [s[1] for s in transcriptions[recording.name]] if len(words) == 0 and self.remove_empty_segments: - recordings_to_delete = recording + recordings_to_delete = recording # TODO: this does nothing. 
continue segments_to_delete = [] - for idx, segment in enumerate(recording.segments): + for segment in recording.segments: left_idx = bisect.bisect_left(times, segment.start) right_idx = bisect.bisect_left(times, segment.end) if left_idx == right_idx and self.remove_empty_segments: - segments_to_delete.append(idx) + segments_to_delete.append(segment) continue segment.orth = " ".join(words[left_idx:right_idx]).replace("&", "&") - for sidx in reversed(segments_to_delete): - del recording.segments[sidx] + for segment in reversed(segments_to_delete): + recording.remove_segment(segment) c.dump(self.output_corpus_path.get_path()) @@ -223,7 +223,7 @@ def run(self): sm_entry.value = "/".join([c.name, split_name, segment.name]) sm.map_entries.append(sm_entry) - new_recording_element.segments.append(segment) + new_recording_element.add_segment(segment) segment_count += 1 # update the time stamp with the recording length and add to ffmpeg merge list @@ -547,15 +547,14 @@ def run(self): nc.speaker_name = c.speaker_name # store index of last segment for r in c.recordings: - sr = corpus.Recording() + sr = corpus.Recording(corpus=nc) sr.name = r.name - sr.segments = r.segments sr.speaker_name = r.speaker_name sr.speakers = r.speakers sr.default_speaker = r.default_speaker sr.audio = r.audio - nc.add_recording(sr) - for s in sr.segments: + for s in r.segments: + sr.add_segment(s) segment_file_names.append(nc.name + "/" + sr.name + "/" + s.name) s.start += self.shift diff --git a/datasets/librispeech.py b/datasets/librispeech.py index fca330ecb..2b8840ce5 100644 --- a/datasets/librispeech.py +++ b/datasets/librispeech.py @@ -136,22 +136,19 @@ def run(self): for transcript in self._transcripts: name = "{0}-{1}-{2:04d}".format(transcript["speaker_id"], transcript["chapter"], transcript["segment"]) - recording = corpus.Recording() + recording = corpus.Recording(corpus=c) recording.name = name recording.speaker_name = transcript["speaker_id"] recording.audio = 
"{}/{}.flac".format(transcript["path"], name) used_speaker_ids.add(transcript["speaker_id"]) - segment = corpus.Segment() + segment = corpus.Segment(recording=recording) segment.name = name segment.start = 0 segment.end = float("inf") segment.orth = transcript["orth"].strip() - recording.segments.append(segment) - c.recordings.append(recording) - for speaker_id, speaker_info in sorted(self._speakers.items()): if speaker_id not in used_speaker_ids: continue diff --git a/datasets/switchboard.py b/datasets/switchboard.py index d66177509..1361375fa 100644 --- a/datasets/switchboard.py +++ b/datasets/switchboard.py @@ -262,7 +262,7 @@ def run(self): rec_to_segs.pop("sw02167B") for rec_name, segs in sorted(rec_to_segs.items()): - recording = corpus.Recording() + recording = corpus.Recording(corpus=c) recording.name = rec_name recording.audio = os.path.join(self.audio_dir.get_path(), rec_name + ".wav") @@ -272,7 +272,7 @@ def run(self): rec_speaker_id = rec_to_speaker[rec_name]["speaker_id"] for seg in segs: - segment = corpus.Segment() + segment = corpus.Segment(recording=recording) segment.name = seg[0] segment.start = float(seg[1]) segment.end = float(seg[2]) @@ -281,9 +281,6 @@ def run(self): if len(segment.orth) == 0: continue - recording.segments.append(segment) - c.recordings.append(recording) - # add speakers to corpus for speaker_info in rec_to_speaker.values(): speaker = corpus.Speaker() diff --git a/lib/corpus.py b/lib/corpus.py index cd658ce62..fb870a860 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -489,8 +489,8 @@ def __init__( self.left_context_orth = left_context_orth self.right_context_orth = right_context_orth self.speaker_name = speaker_name - - self.recording = recording + if recording: + recording.add_segment(self) def full_orth(self) -> str: """ From f1f0d73612ee755283fb98ff4fa7ce39b08a6926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 09:48:12 -0400 Subject: [PATCH 22/39] Add proper 
setters --- lib/corpus.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index fb870a860..48b38803e 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -178,34 +178,38 @@ def __init__(self, name: Optional[str] = None): @property def subcorpora(self) -> Iterable[Corpus]: """ - Read-only property. - :return: Iterable of all top-level subcorpora. """ return self._subcorpora.values() @subcorpora.setter - def subcorpora(self): - raise AttributeError( - "Corpus.subcorpora is a read-only attribute. " - "If you're using old code, please use the current proper API function." + def subcorpora(self, value: List[Corpus]): + """ + :param value: List of subcorpora that the recording must hold. + The previous subcorpora will be overwritten. + """ + assert isinstance(value, list) and all(isinstance(c, Corpus) for c in value), ( + f"Can only set Corpus.subcorpora to a list, but tried setting it to {type(value)}." ) + self._subcorpora = {c.fullname(): c for c in value} @property def recordings(self) -> Iterable[Recording]: """ - Read-only property. - :return: Iterable of all top-level recordings. """ return self._recordings.values() @recordings.setter - def recordings(self): - raise AttributeError( - "Corpus.recordings is a read-only attribute. " - "If you're using old code, please use the current proper API function." + def recordings(self, value: List[Recording]): + """ + :param value: List of recordings that the corpus must hold. + The previous recordings will be overwritten. + """ + assert isinstance(value, list) and all(isinstance(r, Recording) for r in value), ( + f"Can only set Corpus.recordings to a list, but tried setting it to {type(value)}." 
) + self._recordings = {r.fullname(): r for r in value} def segments(self) -> Iterable[Segment]: """ @@ -385,16 +389,20 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corp @property def segments(self) -> Iterable[Segment]: """ - Read-only property. If one wants to add a segment to this recording, please use :func:`Recording.add_segment`. + :return: Iterable of all segments in a recording. """ return self._segments.values() @segments.setter - def segments(self): - raise AttributeError( - "Recording.segments is a read-only property. " - "If you're using old code, please use the current proper API function." + def segments(self, value: List[Segment]): + """ + :param value: List of segments that the recording must hold. + The previous segments will be overwritten. + """ + assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), ( + f"Can only set Recording.segments to a list, but tried setting it to {type(value)}." ) + self._recordings = {s.fullname(): s for s in value} def fullname(self) -> str: assert self.corpus is not None, ( From f671320032ab7ecc0cc3f7d1d9d84e940ff93293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 09:52:08 -0400 Subject: [PATCH 23/39] Take advantage of setter --- corpus/filter.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/corpus/filter.py b/corpus/filter.py index f518dd1b9..5f3cf2d03 100644 --- a/corpus/filter.py +++ b/corpus/filter.py @@ -437,16 +437,10 @@ def run(self): c.load(tk.uncached_path(self.bliss_corpus)) for rec in c.all_recordings(): - segments_to_delete = [] - for s in rec.segments: - if self.invert_match: - if s.fullname() in segments or s.name in segments: - segments_to_delete.append(s) - else: - if s.fullname() not in segments and s.name not in segments: - segments_to_delete.append(s) - for s in segments_to_delete: - rec.remove_segment(s) + if self.invert_match: + rec.segments = 
[x for x in rec.segments if x.fullname() not in segments and x.name not in segments] + else: + rec.segments = [x for x in rec.segments if x.fullname() in segments or x.name in segments] if self.delete_empty_recordings: # Remove the recordings without segments due to the filtering. From 8472e3ef99b2bf90c7f2fa8a63fe79922aba2ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 09:52:28 -0400 Subject: [PATCH 24/39] Fix recording call --- corpus/data_augmentation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py index 40d9ced09..5b2413472 100644 --- a/corpus/data_augmentation.py +++ b/corpus/data_augmentation.py @@ -202,13 +202,12 @@ def run(self): "-ar {base_frequency} '{audio_out}/%s'" % (r.audio, perturbed_audio_name) ) - pr = corpus.Recording() + pr = corpus.Recording(corpus=nc) pr.name = r.name pr.speaker_name = r.speaker_name pr.speakers = r.speakers pr.default_speaker = r.default_speaker pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name - nc.add_recording(pr) for s in r.segments: pr.add_segment(s) segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name) From cb4856fed999eee88d7093778e3fd47526cb49c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 09:58:45 -0400 Subject: [PATCH 25/39] More fixes --- corpus/data_augmentation.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py index 5b2413472..7de1a7820 100644 --- a/corpus/data_augmentation.py +++ b/corpus/data_augmentation.py @@ -77,7 +77,8 @@ def run(self): r.max_seg_end = max_seg_end # select noise files for each recording - for i, r in enumerate(c.recordings): + recording_list = list(c.recordings) + for i, r in enumerate(recording_list): audio_name = r.audio target_length = r.max_seg_end reverbed_audio_name = 
"noised_" + audio_name.split("/")[-1] @@ -91,11 +92,11 @@ def run(self): noise_audios = [] while noise_length < target_length: - random_index = rng.randint(0, len(c.recordings) - 1) + random_index = rng.randint(0, len(recording_list) - 1) while random_index == i: - random_index = random.randint(0, len(c.recordings) - 1) - noise_audios.append(c.recordings[random_index]) - noise_length += c.recordings[random_index].max_seg_end + random_index = random.randint(0, len(recording_list) - 1) + noise_audios.append(recording_list[random_index]) + noise_length += recording_list[random_index].max_seg_end # create temp noise file temp_noise_track_file = "/dev/shm/{id}/tmp_concat_%i.wav" % n From 5ac2ec17eba10abe68fe482390ce6897dc1b96df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 10:24:23 -0400 Subject: [PATCH 26/39] Update include corpus --- lib/corpus.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 48b38803e..ed93ab52b 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -86,12 +86,12 @@ def startElement(self, name: str, attrs: Dict[str, str]): print( "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name) ) - for sc in c.subcorpora.values(): + for sc in c.subcorpora: sc.parent_corpus = e.parent_corpus - for r in c.recordings.values(): + for r in c.recordings: r.corpus = e - e.subcorpora.update(c.subcorpora) - e.recordings.update(c.recordings) + e._subcorpora.update(c.subcorpora) + e._recordings.update(c.recordings) e.speakers.update(c.speakers) elif name == "recording": assert isinstance(e, Corpus), " may only occur within a or element" From e388a2258ea8e81bc321a35dc999feb4fe5940b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 10:54:36 -0400 Subject: [PATCH 27/39] Add assertions that element must not exist in internal structure when adding it --- 
lib/corpus.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/corpus.py b/lib/corpus.py index ed93ab52b..68d0d62bf 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -267,16 +267,28 @@ def remove_recordings(self, recordings: List[Recording]): self.remove_recording(r) def add_recording(self, recording: Recording): + assert recording.fullname() not in self._recordings, ( + f"Tried to add recording {recording.fullname()} to corpus {self.fullname()}, " + "but the recording is already contained in the corpus." + ) assert isinstance(recording, Recording) recording.corpus = self self._recordings[recording.fullname()] = recording def add_subcorpus(self, corpus: Corpus): + assert corpus.fullname() not in self._subcorpora, ( + f"Tried to add subcorpus {corpus.fullname()} to corpus {self.fullname()}, " + "but the subcorpus is already contained in the corpus." + ) assert isinstance(corpus, Corpus) corpus.parent_corpus = self self._subcorpora[corpus.fullname()] = corpus def add_speaker(self, speaker: Speaker): + assert speaker.name not in self.speakers, ( + f"Tried to add speaker {speaker.name} to corpus {self.fullname()}, " + "but the speaker is already contained in the corpus." + ) assert isinstance(speaker, Speaker) self.speakers[speaker.name] = speaker @@ -441,6 +453,10 @@ def get_segment_by_name(self, name: str) -> Segment: return self._segments[name] def add_segment(self, segment: Segment): + assert segment.name not in self._segments, ( + f"Tried to add segment {segment.name} to recording {self.fullname()}, " + "but the segment is already contained in the recording." + ) assert self.corpus is not None, ( "The recording must be added to a corpus via Corpus.add_recording() before using Recording.add_segment()." 
) From e0f947399966e79e86a2df59bb0c369f86e99334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 15 Jul 2025 11:15:44 -0400 Subject: [PATCH 28/39] Add docstring --- lib/corpus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/corpus.py b/lib/corpus.py index 68d0d62bf..026e4259c 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -22,6 +22,9 @@ class NamedEntity: def __init__(self, name: Optional[str] = None): + """ + :param name: Name of the entity. + """ super().__init__() self.name = name From 20ad0ad5818a0e3adb8b434ac96c80b881cd2446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Wed, 16 Jul 2025 10:57:29 +0200 Subject: [PATCH 29/39] Apply suggestions from code review Co-authored-by: DanEnergetics --- corpus/filter.py | 2 +- lib/corpus.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/corpus/filter.py b/corpus/filter.py index 5f3cf2d03..eaf7a44ba 100644 --- a/corpus/filter.py +++ b/corpus/filter.py @@ -574,7 +574,7 @@ def __call__(self, corpus: corpus.Corpus, recording: corpus.Recording, segment: recordings_to_be_removed = [] for r in c.all_recordings(): num_seg = num_segments_per_recording[r.fullname()] - new_num_seg = len(list(r.segments)) + new_num_seg = len(r.segments) if num_seg and (num_seg - new_num_seg) / num_seg > self.recording_oov_tolerance: recordings_to_be_removed.append(r) diff --git a/lib/corpus.py b/lib/corpus.py index 026e4259c..f537ef4a1 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -260,7 +260,7 @@ def top_level_speakers(self) -> Iterable[Speaker]: yield from self.speakers.values() def remove_recording(self, recording: Recording): - if recording.name in self._recordings: + if recording.fullname() in self._recordings: del self._recordings[recording.fullname()] for sc in self.subcorpora.values(): sc.remove_recording(recording) @@ -417,7 +417,7 @@ def segments(self, value: 
List[Segment]): assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), ( f"Can only set Recording.segments to a list, but tried setting it to {type(value)}." ) - self._recordings = {s.fullname(): s for s in value} + self._segments = {s.fullname(): s for s in value} def fullname(self) -> str: assert self.corpus is not None, ( @@ -456,7 +456,7 @@ def get_segment_by_name(self, name: str) -> Segment: return self._segments[name] def add_segment(self, segment: Segment): - assert segment.name not in self._segments, ( + assert segment.fullname() not in self._segments, ( f"Tried to add segment {segment.name} to recording {self.fullname()}, " "but the segment is already contained in the recording." ) From a12a430736da59bdd9fb5387505c7f3bb9a595ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Wed, 16 Jul 2025 05:00:22 -0400 Subject: [PATCH 30/39] Use name instead of full name --- lib/corpus.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index f537ef4a1..0ba5a48b1 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -194,7 +194,7 @@ def subcorpora(self, value: List[Corpus]): assert isinstance(value, list) and all(isinstance(c, Corpus) for c in value), ( f"Can only set Corpus.subcorpora to a list, but tried setting it to {type(value)}." ) - self._subcorpora = {c.fullname(): c for c in value} + self._subcorpora = {c.name: c for c in value} @property def recordings(self) -> Iterable[Recording]: @@ -212,7 +212,7 @@ def recordings(self, value: List[Recording]): assert isinstance(value, list) and all(isinstance(r, Recording) for r in value), ( f"Can only set Corpus.recordings to a list, but tried setting it to {type(value)}." 
) - self._recordings = {r.fullname(): r for r in value} + self._recordings = {r.name: r for r in value} def segments(self) -> Iterable[Segment]: """ @@ -260,8 +260,8 @@ def top_level_speakers(self) -> Iterable[Speaker]: yield from self.speakers.values() def remove_recording(self, recording: Recording): - if recording.fullname() in self._recordings: - del self._recordings[recording.fullname()] + if recording.name in self._recordings: + del self._recordings[recording.name] for sc in self.subcorpora.values(): sc.remove_recording(recording) @@ -270,22 +270,22 @@ def remove_recordings(self, recordings: List[Recording]): self.remove_recording(r) def add_recording(self, recording: Recording): - assert recording.fullname() not in self._recordings, ( - f"Tried to add recording {recording.fullname()} to corpus {self.fullname()}, " + assert recording.name not in self._recordings, ( + f"Tried to add recording {recording.name} to corpus {self.fullname()}, " "but the recording is already contained in the corpus." ) assert isinstance(recording, Recording) recording.corpus = self - self._recordings[recording.fullname()] = recording + self._recordings[recording.name] = recording def add_subcorpus(self, corpus: Corpus): - assert corpus.fullname() not in self._subcorpora, ( - f"Tried to add subcorpus {corpus.fullname()} to corpus {self.fullname()}, " + assert corpus.name not in self._subcorpora, ( + f"Tried to add subcorpus {corpus.name} to corpus {self.fullname()}, " "but the subcorpus is already contained in the corpus." 
) assert isinstance(corpus, Corpus) corpus.parent_corpus = self - self._subcorpora[corpus.fullname()] = corpus + self._subcorpora[corpus.name] = corpus def add_speaker(self, speaker: Speaker): assert speaker.name not in self.speakers, ( @@ -320,9 +320,7 @@ def filter_segments(self, filter_function: FilterFunction): :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept """ for rec_full_name, r in self._recordings.items(): - self._recordings[rec_full_name]._segments = { - s.fullname(): s for s in r.segments if filter_function(self, r, s) - } + self._recordings[rec_full_name]._segments = {s.name: s for s in r.segments if filter_function(self, r, s)} for subcorpus_full_name in self._subcorpora(): self._subcorpora[subcorpus_full_name].filter_segments(filter_function) @@ -417,7 +415,7 @@ def segments(self, value: List[Segment]): assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), ( f"Can only set Recording.segments to a list, but tried setting it to {type(value)}." ) - self._segments = {s.fullname(): s for s in value} + self._segments = {s.name: s for s in value} def fullname(self) -> str: assert self.corpus is not None, ( @@ -456,7 +454,7 @@ def get_segment_by_name(self, name: str) -> Segment: return self._segments[name] def add_segment(self, segment: Segment): - assert segment.fullname() not in self._segments, ( + assert segment.name not in self._segments, ( f"Tried to add segment {segment.name} to recording {self.fullname()}, " "but the segment is already contained in the recording." 
) @@ -465,13 +463,11 @@ def add_segment(self, segment: Segment): ) assert isinstance(segment, Segment) segment.recording = self - self._segments[segment.fullname()] = segment + self._segments[segment.name] = segment def remove_segment(self, segment: Segment): - assert segment.fullname() in self._segments, ( - f"Segment '{segment.fullname()}' was not found in recording '{self.name}'" - ) - del self._segments[segment.fullname()] + assert segment.name in self._segments, f"Segment '{segment.name}' was not found in recording '{self.name}'" + del self._segments[segment.name] def get_segment_mapping(self) -> Dict[str, Segment]: """ From c03ec82a2291a7dde2c5be806f9e057923eac62c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Wed, 16 Jul 2025 05:02:24 -0400 Subject: [PATCH 31/39] Remove redundant conversion to list --- corpus/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpus/filter.py b/corpus/filter.py index eaf7a44ba..054bd49a1 100644 --- a/corpus/filter.py +++ b/corpus/filter.py @@ -524,7 +524,7 @@ def maybe_to_lower(s): c = corpus.Corpus() c.load(self.corpus.get_path()) - num_segments_per_recording = {r.fullname(): len(list(r.segments)) for r in c.all_recordings()} + num_segments_per_recording = {r.fullname(): len(r.segments) for r in c.all_recordings()} # use var name instead of attribute to avoid problem with name scope log_oov_list = self.log_oov_list From 9278b1e8b1907f50a5f0d50808287ec62974d167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Wed, 16 Jul 2025 05:13:31 -0400 Subject: [PATCH 32/39] Improve retrieval of segments from corpus/recording Allow searching for base name in recording as well, and search in subcorpora when segment not found in main corpus --- lib/corpus.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 0ba5a48b1..9dc33547c 100644 --- a/lib/corpus.py +++ 
b/lib/corpus.py @@ -234,11 +234,15 @@ def get_segment_by_name(self, name: str) -> Segment: """ :return: the segment specified by its full name """ - recording_name, _ = name.rsplit("/", 1) - assert recording_name in self._recordings, ( - f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus" - ) - return self._recordings[recording_name].get_segment_by_name(name) + recording_name, segment_name = name.rsplit("/", maxsplit=1) + if recording_name in self._recordings: + return self._recordings[recording_name].get_segment_by_name(segment_name) + else: + subcorpus_name, segment_name = name.split("/", maxsplit=1) + assert subcorpus_name in self._subcorpora, ( + f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus" + ) + return self._subcorpora[subcorpus_name].get_segment_by_name(segment_name) def all_recordings(self) -> Iterable[Recording]: yield from self.recordings @@ -446,12 +450,13 @@ def dump(self, out: TextIO, indentation: str = ""): def get_segment_by_name(self, name: str) -> Segment: """ - :param name: Full name of the segment. + :param name: Name or full name of the segment. :return: Segment which is identified by the full name specified in :param:`name`. 
""" - assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'" - return self._segments[name] + _, segment_name = name.rsplit("/", maxsplit=1) + assert segment_name in self._segments, f"Segment '{segment_name}' was not found in recording '{self.name}'" + return self._segments[segment_name] def add_segment(self, segment: Segment): assert segment.name not in self._segments, ( From fdc7315680d44d552eac923ba5692e5b7ddb4fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 11 Aug 2025 05:56:11 -0400 Subject: [PATCH 33/39] Use Corpus API --- lib/corpus.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 9dc33547c..7ea89f8d3 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -90,11 +90,9 @@ def startElement(self, name: str, attrs: Dict[str, str]): "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name) ) for sc in c.subcorpora: - sc.parent_corpus = e.parent_corpus + e.add_subcorpus(sc) for r in c.recordings: - r.corpus = e - e._subcorpora.update(c.subcorpora) - e._recordings.update(c.recordings) + e.add_recording(r) e.speakers.update(c.speakers) elif name == "recording": assert isinstance(e, Corpus), " may only occur within a or element" From be48a260f8c99bad88ae577ff16c139906d4cb10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 11 Aug 2025 06:15:12 -0400 Subject: [PATCH 34/39] Add attributes/types to base class --- lib/corpus.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 7ea89f8d3..3a999feb8 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -165,16 +165,20 @@ class Corpus(NamedEntity, CorpusSection): attribute is set. Corpora with include statements can be read but are written back as a single file. 
""" + name: Optional[str] + parent_corpus: Optional[Corpus] + _recordings: Dict[str, Recording] # recording-name: Recording + _subcorpora: Dict[str, Corpus] # corpus-name: Corpus + def __init__(self, name: Optional[str] = None): """ :param name: Corpus name. """ super().__init__(name=name) - self.parent_corpus: Optional[Corpus] = None - - self._subcorpora: Dict[str, Corpus] = {} # full-name: Corpus - self._recordings: Dict[str, Recording] = {} # full-name: Recording + self.parent_corpus = None + self._subcorpora = {} + self._recordings = {} @property def subcorpora(self) -> Iterable[Corpus]: @@ -390,6 +394,15 @@ def __repr__(self): class Recording(NamedEntity, CorpusSection): + """ + This class represents a recording in Bliss format. + """ + + name: Optional[str] + audio: Optional[str] + _segments: Dict[str, Segment] + corpus: Optional[Corpus] + def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corpus: Optional[Corpus] = None): """ :param name: Recording name. @@ -397,9 +410,9 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corp super().__init__(name=name) self.audio = audio + self._segments = {} if corpus: corpus.add_recording(self) - self._segments: Dict[str, Segment] = {} @property def segments(self) -> Iterable[Segment]: @@ -483,6 +496,20 @@ def __repr__(self): class Segment(NamedEntity): + """ + This class represents a segment in Bliss format. 
+ """ + + name: Optional[str] + start: Optional[float] + end: Optional[float] + track: Optional[int] + orth: Optional[str] + left_context_orth: Optional[str] + right_context_orth: Optional[str] + speaker_name: Optional[str] + recording: Optional[Recording] + def __init__( self, *, From f7b42f8e431ad19ae40a3f212b19267591b0aaec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 11 Aug 2025 06:25:49 -0400 Subject: [PATCH 35/39] Various improvements to user class init Also add parent_corpus parameter to corpus init, add else clauses if related object is not provided --- lib/corpus.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 3a999feb8..eb7170344 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -170,16 +170,20 @@ class Corpus(NamedEntity, CorpusSection): _recordings: Dict[str, Recording] # recording-name: Recording _subcorpora: Dict[str, Corpus] # corpus-name: Corpus - def __init__(self, name: Optional[str] = None): + def __init__(self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None): """ :param name: Corpus name. """ super().__init__(name=name) - self.parent_corpus = None self._subcorpora = {} self._recordings = {} + if parent_corpus: + self.parent_corpus.add_subcorpus(self) + else: + self.parent_corpus = None + @property def subcorpora(self) -> Iterable[Corpus]: """ @@ -403,7 +407,7 @@ class Recording(NamedEntity, CorpusSection): _segments: Dict[str, Segment] corpus: Optional[Corpus] - def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corpus: Optional[Corpus] = None): + def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, corpus: Optional[Corpus] = None): """ :param name: Recording name. 
""" @@ -411,8 +415,11 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corp self.audio = audio self._segments = {} + if corpus: corpus.add_recording(self) + else: + self.corpus = None @property def segments(self) -> Iterable[Segment]: @@ -512,8 +519,8 @@ class Segment(NamedEntity): def __init__( self, - *, name: Optional[str] = None, + * start: float = 0.0, end: float = 0.0, track: Optional[int] = None, @@ -542,8 +549,11 @@ def __init__( self.left_context_orth = left_context_orth self.right_context_orth = right_context_orth self.speaker_name = speaker_name + if recording: recording.add_segment(self) + else: + self.recording = None def full_orth(self) -> str: """ From a160dc6abee81783c9ca891cbaa5b220b69f4d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 11 Aug 2025 06:27:03 -0400 Subject: [PATCH 36/39] Remove unneeded assertion --- lib/corpus.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index eb7170344..9f5fe9925 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -481,9 +481,6 @@ def add_segment(self, segment: Segment): f"Tried to add segment {segment.name} to recording {self.fullname()}, " "but the segment is already contained in the recording." ) - assert self.corpus is not None, ( - "The recording must be added to a corpus via Corpus.add_recording() before using Recording.add_segment()." 
- ) assert isinstance(segment, Segment) segment.recording = self self._segments[segment.name] = segment From 6a955e3d45469e6f2dce35a3cdd251b7ad3634a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 11 Aug 2025 06:29:25 -0400 Subject: [PATCH 37/39] Add comma --- lib/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/corpus.py b/lib/corpus.py index 9f5fe9925..477b3f2e7 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -517,7 +517,7 @@ class Segment(NamedEntity): def __init__( self, name: Optional[str] = None, - * + *, start: float = 0.0, end: float = 0.0, track: Optional[int] = None, From eda8ab2cc2b8252862776e8acd946c6c9f50cc94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 11 Aug 2025 06:50:54 -0400 Subject: [PATCH 38/39] Improve docstring --- lib/corpus.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 477b3f2e7..5fbb282b6 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -170,9 +170,13 @@ class Corpus(NamedEntity, CorpusSection): _recordings: Dict[str, Recording] # recording-name: Recording _subcorpora: Dict[str, Corpus] # corpus-name: Corpus - def __init__(self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None): + def __init__( + self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None, load_from: Optional[str] = None + ): """ :param name: Corpus name. + :param parent_corpus: If provided, `self` will be directly added to :param:`parent_corpus` as a subcorpus. + :param load_from: If provided, :func:`load` will be directly run with this parameter. 
""" super().__init__(name=name) @@ -184,6 +188,9 @@ def __init__(self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus else: self.parent_corpus = None + if load_from: + self.load(load_from) + @property def subcorpora(self) -> Iterable[Corpus]: """ @@ -410,6 +417,8 @@ class Recording(NamedEntity, CorpusSection): def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, corpus: Optional[Corpus] = None): """ :param name: Recording name. + :param audio: Actual path to the audio file which contains the playable media. + :param corpus: If provided, `self` will be directly added to the recordings in :param:`corpus`. """ super().__init__(name=name) @@ -528,6 +537,7 @@ def __init__( recording: Optional[Recording] = None, ): """ + :param name: Segment name. :param start: Segment start. :param end: Segment end. :param track: Segment track/channel. @@ -535,7 +545,7 @@ def __init__( :param left_context_orth: Optional left context when aligning (specific for RASR alignment). :param right_context_orth: Optional right context when aligning (specific for RASR alignment). :param speaker_name: Speaker name. - :param recording: Recording in which the segment is embedded. + :param recording: If provided, `self` will be directly added to the segments in :param:`recording`. 
""" super().__init__(name=name) From a22e8bfbf4bc86b64085caa35fa00d8a8a00fb8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Tue, 12 Aug 2025 05:00:56 -0400 Subject: [PATCH 39/39] Work Better code on XML parser, docstring improvements, fixes on get_*_by_name through testing --- lib/corpus.py | 63 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 5fbb282b6..61b73c09a 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -82,9 +82,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): self.elements.append(subcorpus) elif name == "include": assert isinstance(e, Corpus), " may only occur within a or element" - path = os.path.join(os.path.dirname(self.path), attrs["file"]) - c = Corpus() - c.load(path) + c = Corpus(load_from=os.path.join(os.path.dirname(self.path), attrs["file"])) if c.name != e.name: print( "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name) @@ -96,8 +94,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): e.speakers.update(c.speakers) elif name == "recording": assert isinstance(e, Corpus), " may only occur within a or element" - rec = Recording(name=attrs["name"], audio=attrs["audio"]) - e.add_recording(rec) + rec = Recording(name=attrs["name"], audio=attrs["audio"], corpus=e) self.elements.append(rec) elif name == "segment": assert isinstance(e, Recording), " may only occur within a element" @@ -175,8 +172,8 @@ def __init__( ): """ :param name: Corpus name. - :param parent_corpus: If provided, `self` will be directly added to :param:`parent_corpus` as a subcorpus. - :param load_from: If provided, :func:`load` will be directly run with this parameter. + :param parent_corpus: If provided, `self` will be directly linked as a subcorpus of :param:`parent_corpus`. 
+ :param load_from: If provided, :func:`Corpus.load` will be directly run with this parameter. """ super().__init__(name=name) @@ -238,24 +235,47 @@ def segments(self) -> Iterable[Segment]: def get_recording_by_name(self, name: str) -> Recording: """ - :return: the recording specified by its full name + :return: the recording specified by its name relative to `self`. """ - assert name in self._recordings, f"Recording '{name}' was not found in corpus" - return self._recordings[name] + if "/" not in name: + assert name in self._recordings, f"Recording '{name}' was not found in corpus." + return self._recordings[name] + else: + subcorpus_name, recording_relative_name = name.split("/", maxsplit=1) + return self.get_subcorpus_by_name(subcorpus_name).get_recording_by_name(recording_relative_name) + + def get_subcorpus_by_name(self, name: str) -> Corpus: + """ + :return: The corpus specified by its name relative to `self`. + """ + if "/" not in name: + assert name in self._subcorpora, f"Subcorpus '{name}' was not found in corpus." + return self._subcorpora[name] + else: + _, subcorpus_relative_name = name.split("/", maxsplit=1) + return self.get_subcorpus_by_name(subcorpus_relative_name) def get_segment_by_name(self, name: str) -> Segment: """ - :return: the segment specified by its full name + :param name: Segment name relative to the corpus. + Note that it must be at least two levels deep, to also include the recording name. + Example: `my_recording/my_segment`. + :return: the segment specified by its name relative to `self`. """ - recording_name, segment_name = name.rsplit("/", maxsplit=1) + assert "/" in name, ( + "When running Corpus.get_segment_by_name(), at least two levels of depth 'recording/segment' " + "must be provided, separated with '/'." 
+ ) + recording_name, segment_name = name.split("/", maxsplit=1) if recording_name in self._recordings: - return self._recordings[recording_name].get_segment_by_name(segment_name) + return self.get_recording_by_name(recording_name).get_segment_by_name(segment_name) else: - subcorpus_name, segment_name = name.split("/", maxsplit=1) + # The first part is the subcorpus, and the second is the rest of the segment. + subcorpus_name, segment_relative_name = recording_name, segment_name assert subcorpus_name in self._subcorpora, ( - f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus" + f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus." ) - return self._subcorpora[subcorpus_name].get_segment_by_name(segment_name) + return self.get_subcorpus_by_name(subcorpus_name).get_segment_by_name(segment_relative_name) def all_recordings(self) -> Iterable[Recording]: yield from self.recordings @@ -418,7 +438,7 @@ def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, c """ :param name: Recording name. :param audio: Actual path to the audio file which contains the playable media. - :param corpus: If provided, `self` will be directly added to the recordings in :param:`corpus`. + :param corpus: If provided, `self` will be directly linked as a recording of :param:`corpus`. """ super().__init__(name=name) @@ -481,9 +501,12 @@ def get_segment_by_name(self, name: str) -> Segment: :return: Segment which is identified by the full name specified in :param:`name`. """ - _, segment_name = name.rsplit("/", maxsplit=1) - assert segment_name in self._segments, f"Segment '{segment_name}' was not found in recording '{self.name}'" - return self._segments[segment_name] + assert "/" not in name, ( + "Depth levels 'recording/segment' are not supported for Recording.get_segment_by_name(). " + "Use Corpus.get_segment_by_name() instead." 
+ ) + assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'" + return self._segments[name] def add_segment(self, segment: Segment): assert segment.name not in self._segments, (