From 81cec266328755b17c0aea7c754f61884c0416fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Thu, 10 Jul 2025 03:24:13 -0400
Subject: [PATCH 01/39] Convert corpus structure to dict

---
 lib/corpus.py | 86 +++++++++++++++++++++++----------------------------
 1 file changed, 38 insertions(+), 48 deletions(-)
diff --git a/lib/corpus.py b/lib/corpus.py
index 6e0b6c37e..2b8f666cc 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -45,7 +45,7 @@ class CorpusParser(sax.handler.ContentHandler):
     """
     This classes methods are called by the sax-parser whenever it encounters an event in the xml-file
     (tags/characters/namespaces/...). It uses a stack of elements to remember the part of the corpus that
-    is currently beeing read.
+    is currently being read.
     """
 
     def __init__(self, corpus: Corpus, path: str, *, reformat_orth: bool = True):
@@ -77,7 +77,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
             subcorpus = Corpus()
             subcorpus.name = attrs["name"]
             subcorpus.parent_corpus = e
-            e.subcorpora.append(subcorpus)
+            e.subcorpora[subcorpus.name] = subcorpus
             self.elements.append(subcorpus)
         elif name == "include":
             assert isinstance(e, Corpus), "<include> may only occur within a <corpus> or <subcorpus> element"
@@ -88,12 +88,12 @@ def startElement(self, name: str, attrs: Dict[str, str]):
                 print(
                     "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name)
                 )
-            for sc in c.subcorpora:
+            for sc in c.subcorpora.values():
                 sc.parent_corpus = e.parent_corpus
-            for r in c.recordings:
+            for r in c.recordings.values():
                 r.corpus = e
-            e.subcorpora.extend(c.subcorpora)
-            e.recordings.extend(c.recordings)
+            e.subcorpora.update(c.subcorpora)
+            e.recordings.update(c.recordings)
             e.speakers.update(c.speakers)
         elif name == "recording":
             assert isinstance(e, Corpus), "<recording> may only occur within a <corpus> or <subcorpus> element"
@@ -173,85 +173,75 @@ def __init__(self):
 
         self.parent_corpus: Optional[Corpus] = None
 
-        self.subcorpora: List[Corpus] = []
-        self.recordings: List[Recording] = []
+        self.subcorpora: Dict[str, Corpus] = {}  # full-name: Corpus
+        self.recordings: Dict[str, Recording] = {}  # full-name: Recording
 
     def segments(self) -> Iterable[Segment]:
         """
         :return: an iterator over all segments within the corpus
         """
-        for r in self.recordings:
+        for r in self.recordings.values():
             yield from r.segments
-        for sc in self.subcorpora:
+        for sc in self.subcorpora.values():
             yield from sc.segments()
 
     def get_recording_by_name(self, name: str) -> Recording:
         """
-        :return: the recording specified by its name
+        :return: the recording specified by its full name
         """
-        for rec in self.all_recordings():
-            if rec.fullname() == name:
-                return rec
-        assert False, f"Recording '{name}' was not found in corpus"
+        assert name in self.recordings, f"Recording '{name}' was not found in corpus"
+
+        return self.recordings[name]
 
     def get_segment_by_name(self, name: str) -> Segment:
         """
-        :return: the segment specified by its name
+        :return: the segment specified by its full name
         """
-        for seg in self.segments():
-            if seg.fullname() == name:
-                return seg
-        assert False, f"Segment '{name}' was not found in corpus"
+        recording_name = "/".join(name.split("/")[:-1])
+        assert recording_name in self.recordings, (
+            f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus"
+        )
+
+        return self.recordings[recording_name].get_segment_by_name(name)
 
     def all_recordings(self) -> Iterable[Recording]:
-        yield from self.recordings
-        for sc in self.subcorpora:
+        yield from self.recordings.values()
+        for sc in self.subcorpora.values():
             yield from sc.all_recordings()
 
     def all_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
-        for sc in self.subcorpora:
+        for sc in self.subcorpora.values():
             yield from sc.all_speakers()
 
     def top_level_recordings(self) -> Iterable[Recording]:
-        yield from self.recordings
+        yield from self.recordings.values()
 
     def top_level_subcorpora(self) -> Iterable[Corpus]:
-        yield from self.subcorpora
+        yield from self.subcorpora.values()
 
     def top_level_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
 
     def remove_recording(self, recording: Recording):
-        to_delete = []
-        for idx, r in enumerate(self.recordings):
-            if r is recording or r == recording or r.name == recording:
-                to_delete.append(idx)
-        for idx in reversed(to_delete):
-            del self.recordings[idx]
-        for sc in self.subcorpora:
+        if recording.name in self.recordings:
+            del self.recordings[recording.fullname()]
+        for sc in self.subcorpora.values():
             sc.remove_recording(recording)
 
     def remove_recordings(self, recordings: List[Recording]):
-        recording_fullnames = {recording.fullname() for recording in recordings}
-        to_delete = []
-        for idx, r in enumerate(self.recordings):
-            if r.fullname() in recording_fullnames:
-                to_delete.append(idx)
-        for idx in reversed(to_delete):
-            del self.recordings[idx]
-        for sc in self.subcorpora:
-            sc.remove_recordings(recordings)
+        for r in recordings:
+            self.remove_recording(r)
 
     def add_recording(self, recording: Recording):
         assert isinstance(recording, Recording)
         recording.corpus = self
-        self.recordings.append(recording)
+        self.recordings[recording.fullname()] = recording
 
     def add_subcorpus(self, corpus: Corpus):
         assert isinstance(corpus, Corpus)
         corpus.parent_corpus = self
-        self.subcorpora.append(corpus)
+        self.subcorpora[corpus.fullname()] = corpus
 
     def add_speaker(self, speaker: Speaker):
         assert isinstance(speaker, Speaker)
@@ -281,9 +271,9 @@ def filter_segments(self, filter_function: FilterFunction):
         filter all segments (including in subcorpora) using filter_function
         :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept
         """
-        for r in self.recordings:
-            r.segments = [s for s in r.segments if filter_function(self, r, s)]
-        for sc in self.subcorpora:
+        for r in self.recordings.values():
+            r.segments = {s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)}
+        for sc in self.subcorpora.values():
             sc.filter_segments(filter_function)
 
     def load(self, path: str, *, reformat_orth: bool = True):
@@ -322,10 +312,10 @@ def _dump_internal(self, out: TextIO, indentation: str = ""):
         if self.speaker_name is not None:
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
 
-        for r in self.recordings:
+        for r in self.recordings.values():
             r.dump(out, indentation + "  ")
 
-        for sc in self.subcorpora:
+        for sc in self.subcorpora.values():
             sc._dump_internal(out, indentation + "  ")
 
         if self.parent_corpus is None:

From 3ba9608dd3f3cd09c84cd449b3b11861a377465c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Thu, 10 Jul 2025 03:24:27 -0400
Subject: [PATCH 02/39] Convert recording structure to dict

---
 lib/corpus.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 2b8f666cc..226e2f0fd 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -344,7 +344,7 @@ def __init__(self):
         super().__init__()
         self.audio: Optional[str] = None
         self.corpus: Optional[Corpus] = None
-        self.segments: List[Segment] = []
+        self.segments: Dict[str, Segment] = {}
 
     def fullname(self) -> str:
         return self.corpus.fullname() + "/" + self.name
@@ -365,21 +365,26 @@ def dump(self, out: TextIO, indentation: str = ""):
         if self.speaker_name is not None:
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
 
-        for s in self.segments:
+        for s in self.segments.values():
             s.dump(out, indentation + "  ")
 
         out.write("%s</recording>\n" % indentation)
 
+    def get_segment_by_name(self, name: str):
+        assert name in self.segments, f"Segment '{name}' was not found in recording '{self.name}'"
+
+        return self.segments[name]
+
     def add_segment(self, segment: Segment):
         assert isinstance(segment, Segment)
         segment.recording = self
-        self.segments.append(segment)
+        self.segments[segment.fullname()] = segment
 
     def get_segment_mapping(self) -> Dict[str, Segment]:
         """
         :return: Mapping from segment fullnames to actual segments.
         """
-        return {seg.fullname(): seg for seg in self.segments}
+        return {seg.fullname(): seg for seg in self.segments.values()}
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.fullname()}>"

From b066017304744a2d026654035904a62faf9e0f13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Thu, 10 Jul 2025 03:53:30 -0400
Subject: [PATCH 03/39] Fix `Corpus.segments()` call

Now return the actual segments instead of the segment full names, following the previous commit
---
 lib/corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 226e2f0fd..8833f2070 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -181,7 +181,7 @@ def segments(self) -> Iterable[Segment]:
         :return: an iterator over all segments within the corpus
         """
         for r in self.recordings.values():
-            yield from r.segments
+            yield from r.segments.values()
         for sc in self.subcorpora.values():
             yield from sc.segments()
 

From 16c7271ec84fffb42ea2ca8419aa86d5924f897c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <31628502+Icemole@users.noreply.github.com>
Date: Thu, 10 Jul 2025 10:17:18 +0200
Subject: [PATCH 04/39] Use `rsplit` instead of splitting and concatenating
 back

Co-authored-by: Albert Zeyer <zeyer@cs.rwth-aachen.de>
---
 lib/corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 8833f2070..da45669a4 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -197,7 +197,7 @@ def get_segment_by_name(self, name: str) -> Segment:
         """
         :return: the segment specified by its full name
         """
-        recording_name = "/".join(name.split("/")[:-1])
+        recording_name, _ = name.rsplit("/", 1)
         assert recording_name in self.recordings, (
             f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus"
         )

From 28ce8893b04fd1eea91151a55753c65e35965af0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 14 Jul 2025 06:26:34 -0400
Subject: [PATCH 05/39] Add recording to corpus before adding its segments

---
 tests/job_tests/corpus/test_transform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py
index 9b71e54c6..f074a7afa 100644
--- a/tests/job_tests/corpus/test_transform.py
+++ b/tests/job_tests/corpus/test_transform.py
@@ -30,13 +30,13 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp
         corpus.name = corpus_dict["name"]
         for recording_dict in corpus_dict.get("recordings", []):
             recording = libcorpus.Recording()
+            corpus.add_recording(recording)
             recording.name = recording_dict["name"]
             for segment_name in recording_dict.get("segments", []):
                 segment = libcorpus.Segment()
                 segment.name = segment_name
                 segment.orth = ""
                 recording.add_segment(segment)
-            corpus.add_recording(recording)
         for subcorpus_dict in corpus_dict.get("subcorpora", []):
             corpus.add_subcorpus(_CorpusCreatorHelper._create_corpus_with_structure(subcorpus_dict))
 

From 14247a5143888ca407d98d665cf7990e0428b53b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 14 Jul 2025 06:37:12 -0400
Subject: [PATCH 06/39] Fix: set recording name before adding it to corpus

---
 tests/job_tests/corpus/test_transform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py
index f074a7afa..db9f517e5 100644
--- a/tests/job_tests/corpus/test_transform.py
+++ b/tests/job_tests/corpus/test_transform.py
@@ -30,8 +30,8 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp
         corpus.name = corpus_dict["name"]
         for recording_dict in corpus_dict.get("recordings", []):
             recording = libcorpus.Recording()
-            corpus.add_recording(recording)
             recording.name = recording_dict["name"]
+            corpus.add_recording(recording)
             for segment_name in recording_dict.get("segments", []):
                 segment = libcorpus.Segment()
                 segment.name = segment_name

From 4cd832ac43ab84b9b1975ea8967cceb88d54303f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 14 Jul 2025 06:39:46 -0400
Subject: [PATCH 07/39] Add name to NamedEntity/Corpus/Recording/Segment init

---
 lib/corpus.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index da45669a4..c6b9feec2 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -21,9 +21,9 @@
 
 
 class NamedEntity:
-    def __init__(self):
+    def __init__(self, name: Optional[str] = None):
         super().__init__()
-        self.name: Optional[str] = None
+        self.name = name
 
     def __repr__(self):
         if self.name is None:
@@ -168,8 +168,11 @@ class Corpus(NamedEntity, CorpusSection):
     attribute is set. Corpora with include statements can be read but are written back as a single file.
     """
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name: Optional[str] = None):
+        """
+        :param name: Corpus name.
+        """
+        super().__init__(name=name)
 
         self.parent_corpus: Optional[Corpus] = None
 
@@ -340,8 +343,12 @@ def __repr__(self):
 
 
 class Recording(NamedEntity, CorpusSection):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name: Optional[str] = None):
+        """
+        :param name: Recording name.
+        """
+        super().__init__(name=name)
+
         self.audio: Optional[str] = None
         self.corpus: Optional[Corpus] = None
         self.segments: Dict[str, Segment] = {}
@@ -394,6 +401,7 @@ class Segment(NamedEntity):
     def __init__(
         self,
         *,
+        name: Optional[str] = None,
         start: float = 0.0,
         end: float = 0.0,
         track: Optional[int] = None,
@@ -413,7 +421,7 @@ def __init__(
         :param speaker_name: Speaker name.
         :param recording: Recording in which the segment is embedded.
         """
-        super().__init__()
+        super().__init__(name=name)
 
         self.start = start
         self.end = end

From dc8b4e4288de7bdee34d325f1b6ac87ae9538d4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 14 Jul 2025 06:41:49 -0400
Subject: [PATCH 08/39] Use newly declared parameters in init

---
 tests/job_tests/corpus/test_transform.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py
index db9f517e5..036d0b442 100644
--- a/tests/job_tests/corpus/test_transform.py
+++ b/tests/job_tests/corpus/test_transform.py
@@ -26,16 +26,12 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp
         :param corpus_dict: Definition of a corpus in dictionary form.
         :return: Corpus object defined by the corpus dictionary provided.
         """
-        corpus = libcorpus.Corpus()
-        corpus.name = corpus_dict["name"]
+        corpus = libcorpus.Corpus(name=corpus_dict["name"])
         for recording_dict in corpus_dict.get("recordings", []):
-            recording = libcorpus.Recording()
-            recording.name = recording_dict["name"]
+            recording = libcorpus.Recording(name=recording_dict["name"])
             corpus.add_recording(recording)
             for segment_name in recording_dict.get("segments", []):
-                segment = libcorpus.Segment()
-                segment.name = segment_name
-                segment.orth = ""
+                segment = libcorpus.Segment(name=segment_name, orth="")
                 recording.add_segment(segment)
         for subcorpus_dict in corpus_dict.get("subcorpora", []):
             corpus.add_subcorpus(_CorpusCreatorHelper._create_corpus_with_structure(subcorpus_dict))

From 9d2387221ac93d7537cf535044ae9651dfa104d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <31628502+Icemole@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:07:26 +0200
Subject: [PATCH 09/39] Directly copy self segments

Co-authored-by: DanEnergetics <d.mann95.dm@gmail.com>
---
 lib/corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index c6b9feec2..08d7ae1fc 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -391,7 +391,7 @@ def get_segment_mapping(self) -> Dict[str, Segment]:
         """
         :return: Mapping from segment fullnames to actual segments.
         """
-        return {seg.fullname(): seg for seg in self.segments.values()}
+        return self.segments.copy()
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.fullname()}>"

From a673f833c17edd87b895a75d9341b09a1a84eb6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:46:36 -0400
Subject: [PATCH 10/39] CorpusParser: use constructor args and add_subcorpus

---
 lib/corpus.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 08d7ae1fc..0591d7066 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -74,10 +74,8 @@ def startElement(self, name: str, attrs: Dict[str, str]):
             e.name = attrs["name"]
         elif name == "subcorpus":
             assert isinstance(e, Corpus), "<subcorpus> may only occur within a <corpus> or <subcorpus> element"
-            subcorpus = Corpus()
-            subcorpus.name = attrs["name"]
-            subcorpus.parent_corpus = e
-            e.subcorpora[subcorpus.name] = subcorpus
+            subcorpus = Corpus(name=attrs["name"])
+            e.add_subcorpus(subcorpus)
             self.elements.append(subcorpus)
         elif name == "include":
             assert isinstance(e, Corpus), "<include> may only occur within a <corpus> or <subcorpus> element"
@@ -97,9 +95,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
             e.speakers.update(c.speakers)
         elif name == "recording":
             assert isinstance(e, Corpus), "<recording> may only occur within a <corpus> or <subcorpus> element"
-            rec = Recording()
-            rec.name = attrs["name"]
-            rec.audio = attrs["audio"]
+            rec = Recording(name=attrs["name"], audio=attrs["audio"])
             e.add_recording(rec)
             self.elements.append(rec)
         elif name == "segment":

From 1ebeb7db00a7d05928e3dfd0be4f1b85f67576cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:46:59 -0400
Subject: [PATCH 11/39] Corpus: add subcorpora, recordings properties as read
 only

---
 lib/corpus.py | 52 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 0591d7066..0f5ad86ef 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -172,59 +172,71 @@ def __init__(self, name: Optional[str] = None):
 
         self.parent_corpus: Optional[Corpus] = None
 
-        self.subcorpora: Dict[str, Corpus] = {}  # full-name: Corpus
-        self.recordings: Dict[str, Recording] = {}  # full-name: Recording
+        self._subcorpora: Dict[str, Corpus] = {}  # full-name: Corpus
+        self._recordings: Dict[str, Recording] = {}  # full-name: Recording
+
+    @property
+    def subcorpora(self):
+        """
+        Read-only property. If one wants to add a subcorpus to this corpus, please use :func:`Corpus.add_subcorpus`.
+        """
+        return self._subcorpora.values()
+
+    @property
+    def recordings(self):
+        """
+        Read-only property. If one wants to add a recording to this corpus, please use :func:`Corpus.add_recording`.
+        """
+        return self._recordings.values()
 
     def segments(self) -> Iterable[Segment]:
         """
         :return: an iterator over all segments within the corpus
         """
-        for r in self.recordings.values():
-            yield from r.segments.values()
-        for sc in self.subcorpora.values():
+        for r in self.recordings:
+            yield from r.segments
+        for sc in self.subcorpora:
             yield from sc.segments()
 
     def get_recording_by_name(self, name: str) -> Recording:
         """
         :return: the recording specified by its full name
         """
-        assert name in self.recordings, f"Recording '{name}' was not found in corpus"
-
-        return self.recordings[name]
+        assert name in self._recordings, f"Recording '{name}' was not found in corpus"
+        return self._recordings[name]
 
     def get_segment_by_name(self, name: str) -> Segment:
         """
         :return: the segment specified by its full name
         """
         recording_name, _ = name.rsplit("/", 1)
-        assert recording_name in self.recordings, (
+        assert recording_name in self._recordings, (
             f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus"
         )
-
-        return self.recordings[recording_name].get_segment_by_name(name)
+        return self._recordings[recording_name].get_segment_by_name(name)
 
     def all_recordings(self) -> Iterable[Recording]:
-        yield from self.recordings.values()
-        for sc in self.subcorpora.values():
+        yield from self.recordings
+        for sc in self.subcorpora:
             yield from sc.all_recordings()
 
     def all_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
-        for sc in self.subcorpora.values():
+        for sc in self.subcorpora:
             yield from sc.all_speakers()
 
     def top_level_recordings(self) -> Iterable[Recording]:
-        yield from self.recordings.values()
+        yield from self.recordings
 
     def top_level_subcorpora(self) -> Iterable[Corpus]:
-        yield from self.subcorpora.values()
+        yield from self.subcorpora
 
     def top_level_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
 
     def remove_recording(self, recording: Recording):
-        if recording.name in self.recordings:
-            del self.recordings[recording.fullname()]
+        if recording.name in self._recordings:
+            del self._recordings[recording.fullname()]
         for sc in self.subcorpora.values():
             sc.remove_recording(recording)
 
@@ -235,12 +247,12 @@ def remove_recordings(self, recordings: List[Recording]):
     def add_recording(self, recording: Recording):
         assert isinstance(recording, Recording)
         recording.corpus = self
-        self.recordings[recording.fullname()] = recording
+        self._recordings[recording.fullname()] = recording
 
     def add_subcorpus(self, corpus: Corpus):
         assert isinstance(corpus, Corpus)
         corpus.parent_corpus = self
-        self.subcorpora[corpus.fullname()] = corpus
+        self._subcorpora[corpus.fullname()] = corpus
 
     def add_speaker(self, speaker: Speaker):
         assert isinstance(speaker, Speaker)

From 1e678e83ece4348745bdc5ce1706effd85840c33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:48:23 -0400
Subject: [PATCH 12/39] Update filter segments function

---
 lib/corpus.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 0f5ad86ef..587ad7677 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -282,10 +282,12 @@ def filter_segments(self, filter_function: FilterFunction):
         filter all segments (including in subcorpora) using filter_function
         :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept
         """
-        for r in self.recordings.values():
-            r.segments = {s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)}
-        for sc in self.subcorpora.values():
-            sc.filter_segments(filter_function)
+        for rec_full_name, r in self._recordings.items():
+            self._recordings[rec_full_name]._segments = {
+                s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)
+            }
+        for subcorpus_full_name in self._subcorpora():
+            self._subcorpora[subcorpus_full_name].filter_segments(filter_function)
 
     def load(self, path: str, *, reformat_orth: bool = True):
         """

From de59bece355f9d51374fef442d87e2e1d04b6658 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:48:47 -0400
Subject: [PATCH 13/39] Corpus: add subcorpora, recordings as properties (2)

---
 lib/corpus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 587ad7677..e0b18b647 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -325,10 +325,10 @@ def _dump_internal(self, out: TextIO, indentation: str = ""):
         if self.speaker_name is not None:
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
 
-        for r in self.recordings.values():
+        for r in self.recordings:
             r.dump(out, indentation + "  ")
 
-        for sc in self.subcorpora.values():
+        for sc in self.subcorpora:
             sc._dump_internal(out, indentation + "  ")
 
         if self.parent_corpus is None:

From b6003a3a768cff165ae699e2d77a060388d2b6d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:49:01 -0400
Subject: [PATCH 14/39] Recording: add segments as property

---
 lib/corpus.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index e0b18b647..139fbcd68 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -353,17 +353,24 @@ def __repr__(self):
 
 
 class Recording(NamedEntity, CorpusSection):
-    def __init__(self, name: Optional[str] = None):
+    def __init__(self, name: Optional[str] = None, audio: Optional[str] = None):
         """
         :param name: Recording name.
         """
         super().__init__(name=name)
 
-        self.audio: Optional[str] = None
+        self.audio = audio
         self.corpus: Optional[Corpus] = None
-        self.segments: Dict[str, Segment] = {}
+        self._segments: Dict[str, Segment] = {}
+
+    @property
+    def segments(self):
+        return self._segments.values()
 
     def fullname(self) -> str:
+        assert self.corpus is not None, (
+            "Please add the recording to a corpus via Corpus.add_recording() before triggering fullname()."
+        )
         return self.corpus.fullname() + "/" + self.name
 
     def speaker(self, speaker_name: Optional[str] = None) -> Speaker:
@@ -382,26 +389,25 @@ def dump(self, out: TextIO, indentation: str = ""):
         if self.speaker_name is not None:
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
 
-        for s in self.segments.values():
+        for s in self.segments:
             s.dump(out, indentation + "  ")
 
         out.write("%s</recording>\n" % indentation)
 
     def get_segment_by_name(self, name: str):
-        assert name in self.segments, f"Segment '{name}' was not found in recording '{self.name}'"
-
-        return self.segments[name]
+        assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'"
+        return self._segments[name]
 
     def add_segment(self, segment: Segment):
         assert isinstance(segment, Segment)
         segment.recording = self
-        self.segments[segment.fullname()] = segment
+        self._segments[segment.fullname()] = segment
 
     def get_segment_mapping(self) -> Dict[str, Segment]:
         """
         :return: Mapping from segment fullnames to actual segments.
         """
-        return self.segments.copy()
+        return self._segments.copy()
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.fullname()}>"

From a1ea859f23559cdebbcd97560bbd809d7f491b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:49:30 -0400
Subject: [PATCH 15/39] Segment: add assertion in fullname

---
 lib/corpus.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/corpus.py b/lib/corpus.py
index 139fbcd68..aeeda2567 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -456,6 +456,9 @@ def full_orth(self) -> str:
         return " ".join([s for s in [self.left_context_orth, self.orth, self.right_context_orth] if s])
 
     def fullname(self) -> str:
+        assert self.recording is not None, (
+            "Please add the recording to a corpus via Recording.add_segment() before triggering Segment.fullname()."
+        )
         return self.recording.fullname() + "/" + self.name
 
     def speaker(self) -> Speaker:

From 6d41c465026b99f1a5cadfe8e4a44bf2b29052a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 05:58:01 -0400
Subject: [PATCH 16/39] Always return iterables

---
 lib/corpus.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index aeeda2567..1dfd8fb97 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -176,14 +176,14 @@ def __init__(self, name: Optional[str] = None):
         self._recordings: Dict[str, Recording] = {}  # full-name: Recording
 
     @property
-    def subcorpora(self):
+    def subcorpora(self) -> Iterable[Corpus]:
         """
         Read-only property. If one wants to add a subcorpus to this corpus, please use :func:`Corpus.add_subcorpus`.
         """
         return self._subcorpora.values()
 
     @property
-    def recordings(self):
+    def recordings(self) -> Iterable[Recording]:
         """
         Read-only property. If one wants to add a recording to this corpus, please use :func:`Corpus.add_recording`.
         """
@@ -364,7 +364,10 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None):
         self._segments: Dict[str, Segment] = {}
 
     @property
-    def segments(self):
+    def segments(self) -> Iterable[Segment]:
+        """
+        Read-only property. If one wants to add a segment to this recording, please use :func:`Recording.add_segment`.
+        """
         return self._segments.values()
 
     def fullname(self) -> str:

From d5fa7a942d280be03b8f04461ca94de5f457bf30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 06:26:08 -0400
Subject: [PATCH 17/39] Set explicit read only properties

---
 lib/corpus.py | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 1dfd8fb97..8101807b9 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -178,17 +178,35 @@ def __init__(self, name: Optional[str] = None):
     @property
     def subcorpora(self) -> Iterable[Corpus]:
         """
-        Read-only property. If one wants to add a subcorpus to this corpus, please use :func:`Corpus.add_subcorpus`.
+        Read-only property.
+
+        :return: Iterable of all top-level subcorpora.
         """
         return self._subcorpora.values()
 
+    @subcorpora.setter
+    def subcorpora(self):
+        raise AttributeError(
+            "Corpus.subcorpora is a read-only attribute. "
+            "If you're using old code, please use the current proper API function."
+        )
+
     @property
     def recordings(self) -> Iterable[Recording]:
         """
-        Read-only property. If one wants to add a recording to this corpus, please use :func:`Corpus.add_recording`.
+        Read-only property.
+
+        :return: Iterable of all top-level recordings.
         """
         return self._recordings.values()
 
+    @recordings.setter
+    def recordings(self):
+        raise AttributeError(
+            "Corpus.recordings is a read-only attribute. "
+            "If you're using old code, please use the current proper API function."
+        )
+
     def segments(self) -> Iterable[Segment]:
         """
         :return: an iterator over all segments within the corpus
@@ -353,14 +371,15 @@ def __repr__(self):
 
 
 class Recording(NamedEntity, CorpusSection):
-    def __init__(self, name: Optional[str] = None, audio: Optional[str] = None):
+    def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corpus: Optional[Corpus] = None):
         """
         :param name: Recording name.
         """
         super().__init__(name=name)
 
         self.audio = audio
-        self.corpus: Optional[Corpus] = None
+        if corpus:
+            corpus.add_recording(self)
         self._segments: Dict[str, Segment] = {}
 
     @property
@@ -370,6 +389,13 @@ def segments(self) -> Iterable[Segment]:
         """
         return self._segments.values()
 
+    @segments.setter
+    def segments(self):
+        raise AttributeError(
+            "Recording.segments is a read-only property. "
+            "If you're using old code, please use the current proper API function."
+        )
+
     def fullname(self) -> str:
         assert self.corpus is not None, (
             "Please add the recording to a corpus via Corpus.add_recording() before triggering fullname()."
@@ -402,6 +428,9 @@ def get_segment_by_name(self, name: str):
         return self._segments[name]
 
     def add_segment(self, segment: Segment):
+        assert self.corpus is not None, (
+            "The recording must be added to a corpus via Corpus.add_recording() before using Recording.add_segment()."
+        )
         assert isinstance(segment, Segment)
         segment.recording = self
         self._segments[segment.fullname()] = segment

From 151966c8abdead4654040fd9aa957513e9e11879 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 06:32:35 -0400
Subject: [PATCH 18/39] Improve docstring

---
 lib/corpus.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 8101807b9..f305361fe 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -423,7 +423,12 @@ def dump(self, out: TextIO, indentation: str = ""):
 
         out.write("%s</recording>\n" % indentation)
 
-    def get_segment_by_name(self, name: str):
+    def get_segment_by_name(self, name: str) -> Segment:
+        """
+        :param name: Full name of the segment.
+
+        :return: Segment which is identified by the full name specified in :param:`name`.
+        """
         assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'"
         return self._segments[name]
 

From b406ac795472d79e45b2c68c599db0763b1427a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 06:32:43 -0400
Subject: [PATCH 19/39] Add remove_segment call

---
 lib/corpus.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/corpus.py b/lib/corpus.py
index f305361fe..df5ee0178 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -440,6 +440,12 @@ def add_segment(self, segment: Segment):
         segment.recording = self
         self._segments[segment.fullname()] = segment
 
+    def remove_segment(self, segment: Segment):
+        assert segment.fullname() in self._segments, (
+            f"Segment '{segment.fullname()}' was not found in recording '{self.name}'"
+        )
+        del self._segments[segment.fullname()]
+
     def get_segment_mapping(self) -> Dict[str, Segment]:
         """
         :return: Mapping from segment fullnames to actual segments.

From 4d356dfe9ac105458f15d390cb337f69ef51b7a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 06:48:18 -0400
Subject: [PATCH 20/39] Fix recording segments call

---
 lib/corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index df5ee0178..cd658ce62 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -302,7 +302,7 @@ def filter_segments(self, filter_function: FilterFunction):
         """
         for rec_full_name, r in self._recordings.items():
             self._recordings[rec_full_name]._segments = {
-                s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)
+                s.fullname(): s for s in r.segments if filter_function(self, r, s)
             }
         for subcorpus_full_name in self._subcorpora():
             self._subcorpora[subcorpus_full_name].filter_segments(filter_function)

From 42e69d204cc30348308dc98fab067d2e5bc07c6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 06:48:33 -0400
Subject: [PATCH 21/39] Fix Recording.segments calls throughout the repo

---
 audio/ffmpeg.py             |  5 +++--
 corpus/data_augmentation.py | 11 +++++------
 corpus/filter.py            | 29 ++++++++++++++++-------------
 corpus/transform.py         | 19 +++++++++----------
 datasets/librispeech.py     |  7 ++-----
 datasets/switchboard.py     |  7 ++-----
 lib/corpus.py               |  4 ++--
 7 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/audio/ffmpeg.py b/audio/ffmpeg.py
index 9809b8eb2..b5ce0331a 100644
--- a/audio/ffmpeg.py
+++ b/audio/ffmpeg.py
@@ -159,8 +159,9 @@ def run_recover_duration(self):
         c.load("temp_corpus.xml.gz")
 
         for r in c.all_recordings():
-            assert len(r.segments) == 1, "needs to be a single segment recording"
-            segment = r.segments[0]
+            recording_segments = list(r.segments)
+            assert len(recording_segments) == 1, "needs to be a single segment recording"
+            segment = recording_segments[0]
             old_duration = segment.end
             assert r.audio is not None
             data, sample_rate = soundfile.read(open(r.audio, "rb"))
diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py
index 0be616aad..40d9ced09 100644
--- a/corpus/data_augmentation.py
+++ b/corpus/data_augmentation.py
@@ -134,15 +134,14 @@ def run(self):
                 command = ffmpeg_head + noise_inputs + filter_head + volume_reduction + mixer + filter_tail
                 self.sh(command)
 
-            nr = corpus.Recording()
+            nr = corpus.Recording(corpus=nc)
             nr.name = r.name
-            nr.segments = r.segments
             nr.speaker_name = r.speaker_name
             nr.default_speaker = r.default_speaker
             nr.speakers = r.speakers
             nr.audio = str(self.out_audio_folder) + "/" + reverbed_audio_name
-            nc.add_recording(nr)
-            for s in nr.segments:
+            for s in r.segments:
+                nr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + nr.name + "/" + s.name + "\n")
 
         nc.dump(self.out_corpus.get_path())
@@ -205,13 +204,13 @@ def run(self):
 
             pr = corpus.Recording()
             pr.name = r.name
-            pr.segments = r.segments
             pr.speaker_name = r.speaker_name
             pr.speakers = r.speakers
             pr.default_speaker = r.default_speaker
             pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name
             nc.add_recording(pr)
-            for s in pr.segments:
+            for s in r.segments:
+                pr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name)
                 s.start /= self.speed_factor
                 s.end /= self.speed_factor
diff --git a/corpus/filter.py b/corpus/filter.py
index fc374af17..f518dd1b9 100644
--- a/corpus/filter.py
+++ b/corpus/filter.py
@@ -32,14 +32,11 @@ def _delete_empty_recordings(corpus: corpus.Corpus, removed_recordings_file: str
     :param c: Corpus for which to delete the empty recordings.
     :param removed_recordings_file: File in which to dump all recordings that have been deleted.
     """
-    to_delete = []
-    for rec in corpus.all_recordings():
-        if not rec.segments:
-            to_delete.append(rec)
-
-    corpus.remove_recordings(to_delete)
     with open(removed_recordings_file, "w") as f:
-        f.write("\n".join(rec.fullname() for rec in to_delete))
+        for rec in corpus.all_recordings():
+            if not rec.segments:
+                corpus.remove_recording(rec)
+                f.write(f"{rec.fullname()}\n")
 
 
 class FilterSegmentsByListJob(Job):
@@ -440,10 +437,16 @@ def run(self):
         c.load(tk.uncached_path(self.bliss_corpus))
 
         for rec in c.all_recordings():
-            if self.invert_match:
-                rec.segments = [x for x in rec.segments if x.fullname() not in segments and x.name not in segments]
-            else:
-                rec.segments = [x for x in rec.segments if x.fullname() in segments or x.name in segments]
+            segments_to_delete = []
+            for s in rec.segments:
+                if self.invert_match:
+                    if s.fullname() in segments or s.name in segments:
+                        segments_to_delete.append(s)
+                else:
+                    if s.fullname() not in segments and s.name not in segments:
+                        segments_to_delete.append(s)
+            for s in segments_to_delete:
+                rec.remove_segment(s)
 
         if self.delete_empty_recordings:
             # Remove the recordings without segments due to the filtering.
@@ -527,7 +530,7 @@ def maybe_to_lower(s):
 
         c = corpus.Corpus()
         c.load(self.corpus.get_path())
-        num_segments_per_recording = {r.fullname(): len(r.segments) for r in c.all_recordings()}
+        num_segments_per_recording = {r.fullname(): len(list(r.segments)) for r in c.all_recordings()}
 
         # use var name instead of attribute to avoid problem with name scope
         log_oov_list = self.log_oov_list
@@ -577,7 +580,7 @@ def __call__(self, corpus: corpus.Corpus, recording: corpus.Recording, segment:
             recordings_to_be_removed = []
             for r in c.all_recordings():
                 num_seg = num_segments_per_recording[r.fullname()]
-                new_num_seg = len(r.segments)
+                new_num_seg = len(list(r.segments))
                 if num_seg and (num_seg - new_num_seg) / num_seg > self.recording_oov_tolerance:
                     recordings_to_be_removed.append(r)
 
diff --git a/corpus/transform.py b/corpus/transform.py
index 348ac98df..9605329d3 100644
--- a/corpus/transform.py
+++ b/corpus/transform.py
@@ -70,22 +70,22 @@ def run(self):
             words = [s[1] for s in transcriptions[recording.name]]
 
             if len(words) == 0 and self.remove_empty_segments:
-                recordings_to_delete = recording
+                recordings_to_delete = recording  # TODO: this does nothing.
                 continue
 
             segments_to_delete = []
-            for idx, segment in enumerate(recording.segments):
+            for segment in recording.segments:
                 left_idx = bisect.bisect_left(times, segment.start)
                 right_idx = bisect.bisect_left(times, segment.end)
 
                 if left_idx == right_idx and self.remove_empty_segments:
-                    segments_to_delete.append(idx)
+                    segments_to_delete.append(segment)
                     continue
 
                 segment.orth = " ".join(words[left_idx:right_idx]).replace("&", "&amp;")
 
-            for sidx in reversed(segments_to_delete):
-                del recording.segments[sidx]
+            for segment in reversed(segments_to_delete):
+                recording.remove_segment(segment)
 
         c.dump(self.output_corpus_path.get_path())
 
@@ -223,7 +223,7 @@ def run(self):
                         sm_entry.value = "/".join([c.name, split_name, segment.name])
                         sm.map_entries.append(sm_entry)
 
-                        new_recording_element.segments.append(segment)
+                        new_recording_element.add_segment(segment)
                         segment_count += 1
 
                     # update the time stamp with the recording length and add to ffmpeg merge list
@@ -547,15 +547,14 @@ def run(self):
         nc.speaker_name = c.speaker_name
         # store index of last segment
         for r in c.recordings:
-            sr = corpus.Recording()
+            sr = corpus.Recording(corpus=nc)
             sr.name = r.name
-            sr.segments = r.segments
             sr.speaker_name = r.speaker_name
             sr.speakers = r.speakers
             sr.default_speaker = r.default_speaker
             sr.audio = r.audio
-            nc.add_recording(sr)
-            for s in sr.segments:
+            for s in r.segments:
+                sr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + sr.name + "/" + s.name)
                 s.start += self.shift
 
diff --git a/datasets/librispeech.py b/datasets/librispeech.py
index fca330ecb..2b8840ce5 100644
--- a/datasets/librispeech.py
+++ b/datasets/librispeech.py
@@ -136,22 +136,19 @@ def run(self):
 
         for transcript in self._transcripts:
             name = "{0}-{1}-{2:04d}".format(transcript["speaker_id"], transcript["chapter"], transcript["segment"])
-            recording = corpus.Recording()
+            recording = corpus.Recording(corpus=c)
             recording.name = name
             recording.speaker_name = transcript["speaker_id"]
             recording.audio = "{}/{}.flac".format(transcript["path"], name)
 
             used_speaker_ids.add(transcript["speaker_id"])
 
-            segment = corpus.Segment()
+            segment = corpus.Segment(recording=recording)
             segment.name = name
             segment.start = 0
             segment.end = float("inf")
             segment.orth = transcript["orth"].strip()
 
-            recording.segments.append(segment)
-            c.recordings.append(recording)
-
         for speaker_id, speaker_info in sorted(self._speakers.items()):
             if speaker_id not in used_speaker_ids:
                 continue
diff --git a/datasets/switchboard.py b/datasets/switchboard.py
index d66177509..1361375fa 100644
--- a/datasets/switchboard.py
+++ b/datasets/switchboard.py
@@ -262,7 +262,7 @@ def run(self):
             rec_to_segs.pop("sw02167B")
 
         for rec_name, segs in sorted(rec_to_segs.items()):
-            recording = corpus.Recording()
+            recording = corpus.Recording(corpus=c)
             recording.name = rec_name
             recording.audio = os.path.join(self.audio_dir.get_path(), rec_name + ".wav")
 
@@ -272,7 +272,7 @@ def run(self):
             rec_speaker_id = rec_to_speaker[rec_name]["speaker_id"]
 
             for seg in segs:
-                segment = corpus.Segment()
+                segment = corpus.Segment(recording=recording)
                 segment.name = seg[0]
                 segment.start = float(seg[1])
                 segment.end = float(seg[2])
@@ -281,9 +281,6 @@ def run(self):
                 if len(segment.orth) == 0:
                     continue
 
-                recording.segments.append(segment)
-            c.recordings.append(recording)
-
         # add speakers to corpus
         for speaker_info in rec_to_speaker.values():
             speaker = corpus.Speaker()
diff --git a/lib/corpus.py b/lib/corpus.py
index cd658ce62..fb870a860 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -489,8 +489,8 @@ def __init__(
         self.left_context_orth = left_context_orth
         self.right_context_orth = right_context_orth
         self.speaker_name = speaker_name
-
-        self.recording = recording
+        if recording:
+            recording.add_segment(self)
 
     def full_orth(self) -> str:
         """

From f1f0d73612ee755283fb98ff4fa7ce39b08a6926 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 09:48:12 -0400
Subject: [PATCH 22/39] Add proper setters

---
 lib/corpus.py | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index fb870a860..48b38803e 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -178,34 +178,38 @@ def __init__(self, name: Optional[str] = None):
     @property
     def subcorpora(self) -> Iterable[Corpus]:
         """
-        Read-only property.
-
         :return: Iterable of all top-level subcorpora.
         """
         return self._subcorpora.values()
 
     @subcorpora.setter
-    def subcorpora(self):
-        raise AttributeError(
-            "Corpus.subcorpora is a read-only attribute. "
-            "If you're using old code, please use the current proper API function."
+    def subcorpora(self, value: List[Corpus]):
+        """
+        :param value: List of subcorpora that the recording must hold.
+            The previous subcorpora will be overwritten.
+        """
+        assert isinstance(value, list) and all(isinstance(c, Corpus) for c in value), (
+            f"Can only set Corpus.subcorpora to a list, but tried setting it to {type(value)}."
         )
+        self._subcorpora = {c.fullname(): c for c in value}
 
     @property
     def recordings(self) -> Iterable[Recording]:
         """
-        Read-only property.
-
         :return: Iterable of all top-level recordings.
         """
         return self._recordings.values()
 
     @recordings.setter
-    def recordings(self):
-        raise AttributeError(
-            "Corpus.recordings is a read-only attribute. "
-            "If you're using old code, please use the current proper API function."
+    def recordings(self, value: List[Recording]):
+        """
+        :param value: List of recordings that the corpus must hold.
+            The previous recordings will be overwritten.
+        """
+        assert isinstance(value, list) and all(isinstance(r, Recording) for r in value), (
+            f"Can only set Corpus.recordings to a list, but tried setting it to {type(value)}."
         )
+        self._recordings = {r.fullname(): r for r in value}
 
     def segments(self) -> Iterable[Segment]:
         """
@@ -385,16 +389,20 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corp
     @property
     def segments(self) -> Iterable[Segment]:
         """
-        Read-only property. If one wants to add a segment to this recording, please use :func:`Recording.add_segment`.
+        :return: Iterable of all segments in a recording.
         """
         return self._segments.values()
 
     @segments.setter
-    def segments(self):
-        raise AttributeError(
-            "Recording.segments is a read-only property. "
-            "If you're using old code, please use the current proper API function."
+    def segments(self, value: List[Segment]):
+        """
+        :param value: List of segments that the recording must hold.
+            The previous segments will be overwritten.
+        """
+        assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), (
+            f"Can only set Recording.segments to a list, but tried setting it to {type(value)}."
         )
+        self._recordings = {s.fullname(): s for s in value}
 
     def fullname(self) -> str:
         assert self.corpus is not None, (

From f671320032ab7ecc0cc3f7d1d9d84e940ff93293 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 09:52:08 -0400
Subject: [PATCH 23/39] Take advantage of setter

---
 corpus/filter.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/corpus/filter.py b/corpus/filter.py
index f518dd1b9..5f3cf2d03 100644
--- a/corpus/filter.py
+++ b/corpus/filter.py
@@ -437,16 +437,10 @@ def run(self):
         c.load(tk.uncached_path(self.bliss_corpus))
 
         for rec in c.all_recordings():
-            segments_to_delete = []
-            for s in rec.segments:
-                if self.invert_match:
-                    if s.fullname() in segments or s.name in segments:
-                        segments_to_delete.append(s)
-                else:
-                    if s.fullname() not in segments and s.name not in segments:
-                        segments_to_delete.append(s)
-            for s in segments_to_delete:
-                rec.remove_segment(s)
+            if self.invert_match:
+                rec.segments = [x for x in rec.segments if x.fullname() not in segments and x.name not in segments]
+            else:
+                rec.segments = [x for x in rec.segments if x.fullname() in segments or x.name in segments]
 
         if self.delete_empty_recordings:
             # Remove the recordings without segments due to the filtering.

From 8472e3ef99b2bf90c7f2fa8a63fe79922aba2ec6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 09:52:28 -0400
Subject: [PATCH 24/39] Fix recording call

---
 corpus/data_augmentation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py
index 40d9ced09..5b2413472 100644
--- a/corpus/data_augmentation.py
+++ b/corpus/data_augmentation.py
@@ -202,13 +202,12 @@ def run(self):
                 "-ar {base_frequency} '{audio_out}/%s'" % (r.audio, perturbed_audio_name)
             )
 
-            pr = corpus.Recording()
+            pr = corpus.Recording(corpus=nc)
             pr.name = r.name
             pr.speaker_name = r.speaker_name
             pr.speakers = r.speakers
             pr.default_speaker = r.default_speaker
             pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name
-            nc.add_recording(pr)
             for s in r.segments:
                 pr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name)

From cb4856fed999eee88d7093778e3fd47526cb49c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 09:58:45 -0400
Subject: [PATCH 25/39] More fixes

---
 corpus/data_augmentation.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py
index 5b2413472..7de1a7820 100644
--- a/corpus/data_augmentation.py
+++ b/corpus/data_augmentation.py
@@ -77,7 +77,8 @@ def run(self):
             r.max_seg_end = max_seg_end
 
         # select noise files for each recording
-        for i, r in enumerate(c.recordings):
+        recording_list = list(c.recordings)
+        for i, r in enumerate(recording_list):
             audio_name = r.audio
             target_length = r.max_seg_end
             reverbed_audio_name = "noised_" + audio_name.split("/")[-1]
@@ -91,11 +92,11 @@ def run(self):
                 noise_audios = []
 
                 while noise_length < target_length:
-                    random_index = rng.randint(0, len(c.recordings) - 1)
+                    random_index = rng.randint(0, len(recording_list) - 1)
                     while random_index == i:
-                        random_index = random.randint(0, len(c.recordings) - 1)
-                    noise_audios.append(c.recordings[random_index])
-                    noise_length += c.recordings[random_index].max_seg_end
+                        random_index = random.randint(0, len(recording_list) - 1)
+                    noise_audios.append(recording_list[random_index])
+                    noise_length += recording_list[random_index].max_seg_end
 
                 # create temp noise file
                 temp_noise_track_file = "/dev/shm/{id}/tmp_concat_%i.wav" % n

From 5ac2ec17eba10abe68fe482390ce6897dc1b96df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 10:24:23 -0400
Subject: [PATCH 26/39] Update include corpus

---
 lib/corpus.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 48b38803e..ed93ab52b 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -86,12 +86,12 @@ def startElement(self, name: str, attrs: Dict[str, str]):
                 print(
                     "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name)
                 )
-            for sc in c.subcorpora.values():
+            for sc in c.subcorpora:
                 sc.parent_corpus = e.parent_corpus
-            for r in c.recordings.values():
+            for r in c.recordings:
                 r.corpus = e
-            e.subcorpora.update(c.subcorpora)
-            e.recordings.update(c.recordings)
+            e._subcorpora.update(c.subcorpora)
+            e._recordings.update(c.recordings)
             e.speakers.update(c.speakers)
         elif name == "recording":
             assert isinstance(e, Corpus), "<recording> may only occur within a <corpus> or <subcorpus> element"

From e388a2258ea8e81bc321a35dc999feb4fe5940b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 10:54:36 -0400
Subject: [PATCH 27/39] Add assertions that element must not exist in internal
 structure when adding it

---
 lib/corpus.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/corpus.py b/lib/corpus.py
index ed93ab52b..68d0d62bf 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -267,16 +267,28 @@ def remove_recordings(self, recordings: List[Recording]):
             self.remove_recording(r)
 
     def add_recording(self, recording: Recording):
+        assert recording.fullname() not in self._recordings, (
+            f"Tried to add recording {recording.fullname()} to corpus {self.fullname()}, "
+            "but the recording is already contained in the corpus."
+        )
         assert isinstance(recording, Recording)
         recording.corpus = self
         self._recordings[recording.fullname()] = recording
 
     def add_subcorpus(self, corpus: Corpus):
+        assert corpus.fullname() not in self._subcorpora, (
+            f"Tried to add subcorpus {corpus.fullname()} to corpus {self.fullname()}, "
+            "but the subcorpus is already contained in the corpus."
+        )
         assert isinstance(corpus, Corpus)
         corpus.parent_corpus = self
         self._subcorpora[corpus.fullname()] = corpus
 
     def add_speaker(self, speaker: Speaker):
+        assert speaker.name not in self.speakers, (
+            f"Tried to add speaker {speaker.name} to corpus {self.fullname()}, "
+            "but the speaker is already contained in the corpus."
+        )
         assert isinstance(speaker, Speaker)
         self.speakers[speaker.name] = speaker
 
@@ -441,6 +453,10 @@ def get_segment_by_name(self, name: str) -> Segment:
         return self._segments[name]
 
     def add_segment(self, segment: Segment):
+        assert segment.name not in self._segments, (
+            f"Tried to add segment {segment.name} to recording {self.fullname()}, "
+            "but the segment is already contained in the recording."
+        )
         assert self.corpus is not None, (
             "The recording must be added to a corpus via Corpus.add_recording() before using Recording.add_segment()."
         )

From e0f947399966e79e86a2df59bb0c369f86e99334 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 15 Jul 2025 11:15:44 -0400
Subject: [PATCH 28/39] Add docstring

---
 lib/corpus.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/corpus.py b/lib/corpus.py
index 68d0d62bf..026e4259c 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -22,6 +22,9 @@
 
 class NamedEntity:
     def __init__(self, name: Optional[str] = None):
+        """
+        :param name: Name of the entity.
+        """
         super().__init__()
         self.name = name
 

From 20ad0ad5818a0e3adb8b434ac96c80b881cd2446 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <31628502+Icemole@users.noreply.github.com>
Date: Wed, 16 Jul 2025 10:57:29 +0200
Subject: [PATCH 29/39] Apply suggestions from code review

Co-authored-by: DanEnergetics <d.mann95.dm@gmail.com>
---
 corpus/filter.py | 2 +-
 lib/corpus.py    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/corpus/filter.py b/corpus/filter.py
index 5f3cf2d03..eaf7a44ba 100644
--- a/corpus/filter.py
+++ b/corpus/filter.py
@@ -574,7 +574,7 @@ def __call__(self, corpus: corpus.Corpus, recording: corpus.Recording, segment:
             recordings_to_be_removed = []
             for r in c.all_recordings():
                 num_seg = num_segments_per_recording[r.fullname()]
-                new_num_seg = len(list(r.segments))
+                new_num_seg = len(r.segments)
                 if num_seg and (num_seg - new_num_seg) / num_seg > self.recording_oov_tolerance:
                     recordings_to_be_removed.append(r)
 
diff --git a/lib/corpus.py b/lib/corpus.py
index 026e4259c..f537ef4a1 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -260,7 +260,7 @@ def top_level_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
 
     def remove_recording(self, recording: Recording):
-        if recording.name in self._recordings:
+        if recording.fullname() in self._recordings:
             del self._recordings[recording.fullname()]
         for sc in self.subcorpora.values():
             sc.remove_recording(recording)
@@ -417,7 +417,7 @@ def segments(self, value: List[Segment]):
         assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), (
             f"Can only set Recording.segments to a list, but tried setting it to {type(value)}."
         )
-        self._recordings = {s.fullname(): s for s in value}
+        self._segments = {s.fullname(): s for s in value}
 
     def fullname(self) -> str:
         assert self.corpus is not None, (
@@ -456,7 +456,7 @@ def get_segment_by_name(self, name: str) -> Segment:
         return self._segments[name]
 
     def add_segment(self, segment: Segment):
-        assert segment.name not in self._segments, (
+        assert segment.fullname() not in self._segments, (
             f"Tried to add segment {segment.name} to recording {self.fullname()}, "
             "but the segment is already contained in the recording."
         )

From a12a430736da59bdd9fb5387505c7f3bb9a595ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Wed, 16 Jul 2025 05:00:22 -0400
Subject: [PATCH 30/39] Use name instead of full name

---
 lib/corpus.py | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index f537ef4a1..0ba5a48b1 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -194,7 +194,7 @@ def subcorpora(self, value: List[Corpus]):
         assert isinstance(value, list) and all(isinstance(c, Corpus) for c in value), (
             f"Can only set Corpus.subcorpora to a list, but tried setting it to {type(value)}."
         )
-        self._subcorpora = {c.fullname(): c for c in value}
+        self._subcorpora = {c.name: c for c in value}
 
     @property
     def recordings(self) -> Iterable[Recording]:
@@ -212,7 +212,7 @@ def recordings(self, value: List[Recording]):
         assert isinstance(value, list) and all(isinstance(r, Recording) for r in value), (
             f"Can only set Corpus.recordings to a list, but tried setting it to {type(value)}."
         )
-        self._recordings = {r.fullname(): r for r in value}
+        self._recordings = {r.name: r for r in value}
 
     def segments(self) -> Iterable[Segment]:
         """
@@ -260,8 +260,8 @@ def top_level_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
 
     def remove_recording(self, recording: Recording):
-        if recording.fullname() in self._recordings:
-            del self._recordings[recording.fullname()]
+        if recording.name in self._recordings:
+            del self._recordings[recording.name]
         for sc in self.subcorpora.values():
             sc.remove_recording(recording)
 
@@ -270,22 +270,22 @@ def remove_recordings(self, recordings: List[Recording]):
             self.remove_recording(r)
 
     def add_recording(self, recording: Recording):
-        assert recording.fullname() not in self._recordings, (
-            f"Tried to add recording {recording.fullname()} to corpus {self.fullname()}, "
+        assert recording.name not in self._recordings, (
+            f"Tried to add recording {recording.name} to corpus {self.fullname()}, "
             "but the recording is already contained in the corpus."
         )
         assert isinstance(recording, Recording)
         recording.corpus = self
-        self._recordings[recording.fullname()] = recording
+        self._recordings[recording.name] = recording
 
     def add_subcorpus(self, corpus: Corpus):
-        assert corpus.fullname() not in self._subcorpora, (
-            f"Tried to add subcorpus {corpus.fullname()} to corpus {self.fullname()}, "
+        assert corpus.name not in self._subcorpora, (
+            f"Tried to add subcorpus {corpus.name} to corpus {self.fullname()}, "
             "but the subcorpus is already contained in the corpus."
         )
         assert isinstance(corpus, Corpus)
         corpus.parent_corpus = self
-        self._subcorpora[corpus.fullname()] = corpus
+        self._subcorpora[corpus.name] = corpus
 
     def add_speaker(self, speaker: Speaker):
         assert speaker.name not in self.speakers, (
@@ -320,9 +320,7 @@ def filter_segments(self, filter_function: FilterFunction):
         :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept
         """
         for rec_full_name, r in self._recordings.items():
-            self._recordings[rec_full_name]._segments = {
-                s.fullname(): s for s in r.segments if filter_function(self, r, s)
-            }
+            self._recordings[rec_full_name]._segments = {s.name: s for s in r.segments if filter_function(self, r, s)}
         for subcorpus_full_name in self._subcorpora():
             self._subcorpora[subcorpus_full_name].filter_segments(filter_function)
 
@@ -417,7 +415,7 @@ def segments(self, value: List[Segment]):
         assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), (
             f"Can only set Recording.segments to a list, but tried setting it to {type(value)}."
         )
-        self._segments = {s.fullname(): s for s in value}
+        self._segments = {s.name: s for s in value}
 
     def fullname(self) -> str:
         assert self.corpus is not None, (
@@ -456,7 +454,7 @@ def get_segment_by_name(self, name: str) -> Segment:
         return self._segments[name]
 
     def add_segment(self, segment: Segment):
-        assert segment.fullname() not in self._segments, (
+        assert segment.name not in self._segments, (
             f"Tried to add segment {segment.name} to recording {self.fullname()}, "
             "but the segment is already contained in the recording."
         )
@@ -465,13 +463,11 @@ def add_segment(self, segment: Segment):
         )
         assert isinstance(segment, Segment)
         segment.recording = self
-        self._segments[segment.fullname()] = segment
+        self._segments[segment.name] = segment
 
     def remove_segment(self, segment: Segment):
-        assert segment.fullname() in self._segments, (
-            f"Segment '{segment.fullname()}' was not found in recording '{self.name}'"
-        )
-        del self._segments[segment.fullname()]
+        assert segment.name in self._segments, f"Segment '{segment.name}' was not found in recording '{self.name}'"
+        del self._segments[segment.name]
 
     def get_segment_mapping(self) -> Dict[str, Segment]:
         """

From c03ec82a2291a7dde2c5be806f9e057923eac62c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Wed, 16 Jul 2025 05:02:24 -0400
Subject: [PATCH 31/39] Remove redundant conversion to list

---
 corpus/filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/corpus/filter.py b/corpus/filter.py
index eaf7a44ba..054bd49a1 100644
--- a/corpus/filter.py
+++ b/corpus/filter.py
@@ -524,7 +524,7 @@ def maybe_to_lower(s):
 
         c = corpus.Corpus()
         c.load(self.corpus.get_path())
-        num_segments_per_recording = {r.fullname(): len(list(r.segments)) for r in c.all_recordings()}
+        num_segments_per_recording = {r.fullname(): len(r.segments) for r in c.all_recordings()}
 
         # use var name instead of attribute to avoid problem with name scope
         log_oov_list = self.log_oov_list

From 9278b1e8b1907f50a5f0d50808287ec62974d167 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Wed, 16 Jul 2025 05:13:31 -0400
Subject: [PATCH 32/39] Improve retrieval of segments from corpus/recording

Allow searching for base name in recording as well, and search in subcorpora when segment not found in main corpus
---
 lib/corpus.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 0ba5a48b1..9dc33547c 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -234,11 +234,15 @@ def get_segment_by_name(self, name: str) -> Segment:
         """
         :return: the segment specified by its full name
         """
-        recording_name, _ = name.rsplit("/", 1)
-        assert recording_name in self._recordings, (
-            f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus"
-        )
-        return self._recordings[recording_name].get_segment_by_name(name)
+        recording_name, segment_name = name.rsplit("/", maxsplit=1)
+        if recording_name in self._recordings:
+            return self._recordings[recording_name].get_segment_by_name(segment_name)
+        else:
+            subcorpus_name, segment_name = name.split("/", maxsplit=1)
+            assert subcorpus_name in self._subcorpora, (
+                f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus"
+            )
+            return self._subcorpora[subcorpus_name].get_segment_by_name(segment_name)
 
     def all_recordings(self) -> Iterable[Recording]:
         yield from self.recordings
@@ -446,12 +450,13 @@ def dump(self, out: TextIO, indentation: str = ""):
 
     def get_segment_by_name(self, name: str) -> Segment:
         """
-        :param name: Full name of the segment.
+        :param name: Name or full name of the segment.
 
         :return: Segment which is identified by the full name specified in :param:`name`.
         """
-        assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'"
-        return self._segments[name]
+        _, segment_name = name.rsplit("/", maxsplit=1)
+        assert segment_name in self._segments, f"Segment '{segment_name}' was not found in recording '{self.name}'"
+        return self._segments[segment_name]
 
     def add_segment(self, segment: Segment):
         assert segment.name not in self._segments, (

From fdc7315680d44d552eac923ba5692e5b7ddb4fd8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 11 Aug 2025 05:56:11 -0400
Subject: [PATCH 33/39] Use Corpus API

---
 lib/corpus.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 9dc33547c..7ea89f8d3 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -90,11 +90,9 @@ def startElement(self, name: str, attrs: Dict[str, str]):
                     "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name)
                 )
             for sc in c.subcorpora:
-                sc.parent_corpus = e.parent_corpus
+                e.add_subcorpus(sc)
             for r in c.recordings:
-                r.corpus = e
-            e._subcorpora.update(c.subcorpora)
-            e._recordings.update(c.recordings)
+                e.add_recording(r)
             e.speakers.update(c.speakers)
         elif name == "recording":
             assert isinstance(e, Corpus), "<recording> may only occur within a <corpus> or <subcorpus> element"

From be48a260f8c99bad88ae577ff16c139906d4cb10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 11 Aug 2025 06:15:12 -0400
Subject: [PATCH 34/39] Add attributes/types to base class

---
 lib/corpus.py | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 7ea89f8d3..3a999feb8 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -165,16 +165,20 @@ class Corpus(NamedEntity, CorpusSection):
     attribute is set. Corpora with include statements can be read but are written back as a single file.
     """
 
+    name: Optional[str]
+    parent_corpus: Optional[Corpus]
+    _recordings: Dict[str, Recording]  # recording-name: Recording
+    _subcorpora: Dict[str, Corpus]  # corpus-name: Corpus
+
     def __init__(self, name: Optional[str] = None):
         """
         :param name: Corpus name.
         """
         super().__init__(name=name)
 
-        self.parent_corpus: Optional[Corpus] = None
-
-        self._subcorpora: Dict[str, Corpus] = {}  # full-name: Corpus
-        self._recordings: Dict[str, Recording] = {}  # full-name: Recording
+        self.parent_corpus = None
+        self._subcorpora = {}
+        self._recordings = {}
 
     @property
     def subcorpora(self) -> Iterable[Corpus]:
@@ -390,6 +394,15 @@ def __repr__(self):
 
 
 class Recording(NamedEntity, CorpusSection):
+    """
+    This class represents a recording in Bliss format.
+    """
+
+    name: Optional[str]
+    audio: Optional[str]
+    _segments: Dict[str, Segment]
+    corpus: Optional[Corpus]
+
     def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corpus: Optional[Corpus] = None):
         """
         :param name: Recording name.
@@ -397,9 +410,9 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corp
         super().__init__(name=name)
 
         self.audio = audio
+        self._segments = {}
         if corpus:
             corpus.add_recording(self)
-        self._segments: Dict[str, Segment] = {}
 
     @property
     def segments(self) -> Iterable[Segment]:
@@ -483,6 +496,20 @@ def __repr__(self):
 
 
 class Segment(NamedEntity):
+    """
+    This class represents a segment in Bliss format.
+    """
+
+    name: Optional[str]
+    start: Optional[float]
+    end: Optional[float]
+    track: Optional[int]
+    orth: Optional[str]
+    left_context_orth: Optional[str]
+    right_context_orth: Optional[str]
+    speaker_name: Optional[str]
+    recording: Optional[Recording]
+
     def __init__(
         self,
         *,

From f7b42f8e431ad19ae40a3f212b19267591b0aaec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 11 Aug 2025 06:25:49 -0400
Subject: [PATCH 35/39] Various improvements to user class init

Also add parent_corpus parameter to corpus init, add else clauses if related object is not provided
---
 lib/corpus.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 3a999feb8..eb7170344 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -170,16 +170,20 @@ class Corpus(NamedEntity, CorpusSection):
     _recordings: Dict[str, Recording]  # recording-name: Recording
     _subcorpora: Dict[str, Corpus]  # corpus-name: Corpus
 
-    def __init__(self, name: Optional[str] = None):
+    def __init__(self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None):
         """
         :param name: Corpus name.
         """
         super().__init__(name=name)
 
-        self.parent_corpus = None
         self._subcorpora = {}
         self._recordings = {}
 
+        if parent_corpus:
+            parent_corpus.add_subcorpus(self)  # add_subcorpus() also sets self.parent_corpus
+        else:
+            self.parent_corpus = None
+
     @property
     def subcorpora(self) -> Iterable[Corpus]:
         """
@@ -403,7 +407,7 @@ class Recording(NamedEntity, CorpusSection):
     _segments: Dict[str, Segment]
     corpus: Optional[Corpus]
 
-    def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corpus: Optional[Corpus] = None):
+    def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, corpus: Optional[Corpus] = None):
         """
         :param name: Recording name.
         """
@@ -411,8 +415,11 @@ def __init__(self, name: Optional[str] = None, audio: Optional[str] = None, corp
 
         self.audio = audio
         self._segments = {}
+
         if corpus:
             corpus.add_recording(self)
+        else:
+            self.corpus = None
 
     @property
     def segments(self) -> Iterable[Segment]:
@@ -512,8 +519,8 @@ class Segment(NamedEntity):
 
     def __init__(
         self,
-        *,
         name: Optional[str] = None,
+        *
         start: float = 0.0,
         end: float = 0.0,
         track: Optional[int] = None,
@@ -542,8 +549,11 @@ def __init__(
         self.left_context_orth = left_context_orth
         self.right_context_orth = right_context_orth
         self.speaker_name = speaker_name
+
         if recording:
             recording.add_segment(self)
+        else:
+            self.recording = None
 
     def full_orth(self) -> str:
         """

From a160dc6abee81783c9ca891cbaa5b220b69f4d18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 11 Aug 2025 06:27:03 -0400
Subject: [PATCH 36/39] Remove unneeded assertion

---
 lib/corpus.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index eb7170344..9f5fe9925 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -481,9 +481,6 @@ def add_segment(self, segment: Segment):
             f"Tried to add segment {segment.name} to recording {self.fullname()}, "
             "but the segment is already contained in the recording."
         )
-        assert self.corpus is not None, (
-            "The recording must be added to a corpus via Corpus.add_recording() before using Recording.add_segment()."
-        )
         assert isinstance(segment, Segment)
         segment.recording = self
         self._segments[segment.name] = segment

From 6a955e3d45469e6f2dce35a3cdd251b7ad3634a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 11 Aug 2025 06:29:25 -0400
Subject: [PATCH 37/39] Add comma

---
 lib/corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 9f5fe9925..477b3f2e7 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -517,7 +517,7 @@ class Segment(NamedEntity):
     def __init__(
         self,
         name: Optional[str] = None,
-        *
+        *,
         start: float = 0.0,
         end: float = 0.0,
         track: Optional[int] = None,

From eda8ab2cc2b8252862776e8acd946c6c9f50cc94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 11 Aug 2025 06:50:54 -0400
Subject: [PATCH 38/39] Improve docstrings and add load_from to Corpus init

---
 lib/corpus.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 477b3f2e7..5fbb282b6 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -170,9 +170,13 @@ class Corpus(NamedEntity, CorpusSection):
     _recordings: Dict[str, Recording]  # recording-name: Recording
     _subcorpora: Dict[str, Corpus]  # corpus-name: Corpus
 
-    def __init__(self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None):
+    def __init__(
+        self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None, load_from: Optional[str] = None
+    ):
         """
         :param name: Corpus name.
+        :param parent_corpus: If provided, `self` will be directly added to :param:`parent_corpus` as a subcorpus.
+        :param load_from: If provided, :func:`load` will be directly run with this parameter.
         """
         super().__init__(name=name)
 
@@ -184,6 +188,9 @@ def __init__(self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus
         else:
             self.parent_corpus = None
 
+        if load_from:
+            self.load(load_from)
+
     @property
     def subcorpora(self) -> Iterable[Corpus]:
         """
@@ -410,6 +417,8 @@ class Recording(NamedEntity, CorpusSection):
     def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, corpus: Optional[Corpus] = None):
         """
         :param name: Recording name.
+        :param audio: Actual path to the audio file which contains the playable media.
+        :param corpus: If provided, `self` will be directly added to the recordings in :param:`corpus`.
         """
         super().__init__(name=name)
 
@@ -528,6 +537,7 @@ def __init__(
         recording: Optional[Recording] = None,
     ):
         """
+        :param name: Segment name.
         :param start: Segment start.
         :param end: Segment end.
         :param track: Segment track/channel.
@@ -535,7 +545,7 @@ def __init__(
         :param left_context_orth: Optional left context when aligning (specific for RASR alignment).
         :param right_context_orth: Optional right context when aligning (specific for RASR alignment).
         :param speaker_name: Speaker name.
-        :param recording: Recording in which the segment is embedded.
+        :param recording: If provided, `self` will be directly added to the segments in :param:`recording`.
         """
         super().__init__(name=name)
 

From a22e8bfbf4bc86b64085caa35fa00d8a8a00fb8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Tue, 12 Aug 2025 05:00:56 -0400
Subject: [PATCH 39/39] Refine XML parser and get_*_by_name lookups

Better code on XML parser, docstring improvements, fixes on get_*_by_name through testing
---
 lib/corpus.py | 63 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 20 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 5fbb282b6..61b73c09a 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -82,9 +82,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
             self.elements.append(subcorpus)
         elif name == "include":
             assert isinstance(e, Corpus), "<include> may only occur within a <corpus> or <subcorpus> element"
-            path = os.path.join(os.path.dirname(self.path), attrs["file"])
-            c = Corpus()
-            c.load(path)
+            c = Corpus(load_from=os.path.join(os.path.dirname(self.path), attrs["file"]))
             if c.name != e.name:
                 print(
                     "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name)
@@ -96,8 +94,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
             e.speakers.update(c.speakers)
         elif name == "recording":
             assert isinstance(e, Corpus), "<recording> may only occur within a <corpus> or <subcorpus> element"
-            rec = Recording(name=attrs["name"], audio=attrs["audio"])
-            e.add_recording(rec)
+            rec = Recording(name=attrs["name"], audio=attrs["audio"], corpus=e)
             self.elements.append(rec)
         elif name == "segment":
             assert isinstance(e, Recording), "<segment> may only occur within a <recording> element"
@@ -175,8 +172,8 @@ def __init__(
     ):
         """
         :param name: Corpus name.
-        :param parent_corpus: If provided, `self` will be directly added to :param:`parent_corpus` as a subcorpus.
-        :param load_from: If provided, :func:`load` will be directly run with this parameter.
+        :param parent_corpus: If provided, `self` will be directly linked as a subcorpus of :param:`parent_corpus`.
+        :param load_from: If provided, :func:`Corpus.load` will be directly run with this parameter.
         """
         super().__init__(name=name)
 
@@ -238,24 +235,47 @@ def segments(self) -> Iterable[Segment]:
 
     def get_recording_by_name(self, name: str) -> Recording:
         """
-        :return: the recording specified by its full name
+        :return: the recording specified by its name relative to `self`.
         """
-        assert name in self._recordings, f"Recording '{name}' was not found in corpus"
-        return self._recordings[name]
+        if "/" not in name:
+            assert name in self._recordings, f"Recording '{name}' was not found in corpus."
+            return self._recordings[name]
+        else:
+            subcorpus_name, recording_relative_name = name.split("/", maxsplit=1)
+            return self.get_subcorpus_by_name(subcorpus_name).get_recording_by_name(recording_relative_name)
+
+    def get_subcorpus_by_name(self, name: str) -> Corpus:
+        """
+        :param name: Subcorpus name relative to `self`; nested levels are separated by '/' (`child/grandchild`).
+        :return: The corpus specified by its name relative to `self`.
+        """
+        # Resolve the first level here and let that child resolve the rest, instead of dropping the first level.
+        subcorpus_name, _, subcorpus_relative_name = name.partition("/")
+        assert subcorpus_name in self._subcorpora, f"Subcorpus '{subcorpus_name}' was not found in corpus."
+        subcorpus = self._subcorpora[subcorpus_name]
+        return subcorpus.get_subcorpus_by_name(subcorpus_relative_name) if subcorpus_relative_name else subcorpus
 
     def get_segment_by_name(self, name: str) -> Segment:
         """
-        :return: the segment specified by its full name
+        :param name: Segment name relative to the corpus.
+            Note that it must be at least two levels deep, to also include the recording name.
+            Example: `my_recording/my_segment`.
+        :return: the segment specified by its name relative to `self`.
         """
-        recording_name, segment_name = name.rsplit("/", maxsplit=1)
+        assert "/" in name, (
+            "When running Corpus.get_segment_by_name(), at least two levels of depth 'recording/segment' "
+            "must be provided, separated with '/'."
+        )
+        recording_name, segment_name = name.split("/", maxsplit=1)
         if recording_name in self._recordings:
-            return self._recordings[recording_name].get_segment_by_name(segment_name)
+            return self.get_recording_by_name(recording_name).get_segment_by_name(segment_name)
         else:
-            subcorpus_name, segment_name = name.split("/", maxsplit=1)
+            # The first part is the subcorpus, and the second is the rest of the segment.
+            subcorpus_name, segment_relative_name = recording_name, segment_name
             assert subcorpus_name in self._subcorpora, (
-                f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus"
+                f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus."
             )
-            return self._subcorpora[subcorpus_name].get_segment_by_name(segment_name)
+            return self.get_subcorpus_by_name(subcorpus_name).get_segment_by_name(segment_relative_name)
 
     def all_recordings(self) -> Iterable[Recording]:
         yield from self.recordings
@@ -418,7 +438,7 @@ def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, c
         """
         :param name: Recording name.
         :param audio: Actual path to the audio file which contains the playable media.
-        :param corpus: If provided, `self` will be directly added to the recordings in :param:`corpus`.
+        :param corpus: If provided, `self` will be directly linked as a recording of :param:`corpus`.
         """
         super().__init__(name=name)
 
@@ -481,9 +501,12 @@ def get_segment_by_name(self, name: str) -> Segment:
 
         :return: Segment which is identified by the full name specified in :param:`name`.
         """
-        _, segment_name = name.rsplit("/", maxsplit=1)
-        assert segment_name in self._segments, f"Segment '{segment_name}' was not found in recording '{self.name}'"
-        return self._segments[segment_name]
+        assert "/" not in name, (
+            "Depth levels 'recording/segment' are not supported for Recording.get_segment_by_name(). "
+            "Use Corpus.get_segment_by_name() instead."
+        )
+        assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'"
+        return self._segments[name]
 
     def add_segment(self, segment: Segment):
         assert segment.name not in self._segments, (