Skip to content
Open
Changes from 3 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
81cec26
Convert corpus structure to dict
Icemole Jul 10, 2025
3ba9608
Convert recording structure to dict
Icemole Jul 10, 2025
b066017
Fix `Corpus.segments()` call
Icemole Jul 10, 2025
16c7271
Use `rsplit` instead of splitting and concatenating back
Icemole Jul 10, 2025
28ce889
Add recording at the beginning
Icemole Jul 14, 2025
14247a5
Fix
Icemole Jul 14, 2025
4cd832a
Add name to NamedEntity/Corpus/Recording/Segment init
Icemole Jul 14, 2025
dc8b4e4
Use newly declared parameters in init
Icemole Jul 14, 2025
9d23872
Directly copy self segments
Icemole Jul 15, 2025
a673f83
Better init
Icemole Jul 15, 2025
1ebeb7d
Corpus: add subcorpora, recordings properties as read only
Icemole Jul 15, 2025
1e678e8
Update filter segments function
Icemole Jul 15, 2025
de59bec
Corpus: add subcorpora, recordings as properties (2)
Icemole Jul 15, 2025
b6003a3
Recording: add segments as property
Icemole Jul 15, 2025
a1ea859
Segment: add assertion in fullname
Icemole Jul 15, 2025
6d41c46
Always return iterables
Icemole Jul 15, 2025
d5fa7a9
Set explicit read only properties
Icemole Jul 15, 2025
151966c
Improve docstring
Icemole Jul 15, 2025
b406ac7
Add remove_segment call
Icemole Jul 15, 2025
4d356df
Fix recording segments call
Icemole Jul 15, 2025
42e69d2
Fix Recording.segments alls throughout the repo
Icemole Jul 15, 2025
f1f0d73
Add proper setters
Icemole Jul 15, 2025
f671320
Take advantage of setter
Icemole Jul 15, 2025
8472e3e
Fix recording call
Icemole Jul 15, 2025
cb4856f
More fixes
Icemole Jul 15, 2025
5ac2ec1
Update include corpus
Icemole Jul 15, 2025
e388a22
Add assertions that element must not exist in internal structure when…
Icemole Jul 15, 2025
e0f9473
Add docstring
Icemole Jul 15, 2025
20ad0ad
Apply suggestions from code review
Icemole Jul 16, 2025
a12a430
Use name instead of full name
Icemole Jul 16, 2025
c03ec82
Remove redundant conversion to list
Icemole Jul 16, 2025
9278b1e
Improve retrieval of segments from corpus/recording
Icemole Jul 16, 2025
fdc7315
Use Corpus API
Icemole Aug 11, 2025
be48a26
Add attributes/types to base class
Icemole Aug 11, 2025
f7b42f8
Various improvements to user class init
Icemole Aug 11, 2025
a160dc6
Remove unneeded assertion
Icemole Aug 11, 2025
6a955e3
Add comma
Icemole Aug 11, 2025
eda8ab2
Improve docstring
Icemole Aug 11, 2025
a22e8bf
Work
Icemole Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 48 additions & 53 deletions lib/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class CorpusParser(sax.handler.ContentHandler):
"""
This class's methods are called by the sax-parser whenever it encounters an event in the xml-file
(tags/characters/namespaces/...). It uses a stack of elements to remember the part of the corpus that
is currently beeing read.
is currently being read.
"""

def __init__(self, corpus: Corpus, path: str, *, reformat_orth: bool = True):
Expand Down Expand Up @@ -77,7 +77,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
subcorpus = Corpus()
subcorpus.name = attrs["name"]
subcorpus.parent_corpus = e
e.subcorpora.append(subcorpus)
e.subcorpora[subcorpus.name] = subcorpus
self.elements.append(subcorpus)
elif name == "include":
assert isinstance(e, Corpus), "<include> may only occur within a <corpus> or <subcorpus> element"
Expand All @@ -88,12 +88,12 @@ def startElement(self, name: str, attrs: Dict[str, str]):
print(
"Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name)
)
for sc in c.subcorpora:
for sc in c.subcorpora.values():
sc.parent_corpus = e.parent_corpus
for r in c.recordings:
for r in c.recordings.values():
r.corpus = e
e.subcorpora.extend(c.subcorpora)
e.recordings.extend(c.recordings)
e.subcorpora.update(c.subcorpora)
e.recordings.update(c.recordings)
e.speakers.update(c.speakers)
elif name == "recording":
assert isinstance(e, Corpus), "<recording> may only occur within a <corpus> or <subcorpus> element"
Expand Down Expand Up @@ -173,85 +173,75 @@ def __init__(self):

self.parent_corpus: Optional[Corpus] = None

self.subcorpora: List[Corpus] = []
self.recordings: List[Recording] = []
self.subcorpora: Dict[str, Corpus] = {} # full-name: Corpus
self.recordings: Dict[str, Recording] = {} # full-name: Recording

def segments(self) -> Iterable[Segment]:
    """
    Iterate over every segment contained in this corpus.

    The segments of the recordings held directly by this corpus are yielded first,
    followed by the segments of every subcorpus (recursively).

    :return: an iterator over all segments within the corpus
    """
    # `recordings`, `segments` and `subcorpora` are dicts, so iterate over their values.
    for recording in self.recordings.values():
        yield from recording.segments.values()
    for subcorpus in self.subcorpora.values():
        yield from subcorpus.segments()

def get_recording_by_name(self, name: str) -> Recording:
    """
    Look up a recording anywhere in this corpus (including subcorpora) by its full name.

    :param name: full name of the recording
    :return: the recording specified by its full name
    :raises AssertionError: if no recording with that full name exists in the corpus
    """
    # Fast path: recordings held directly by this corpus are keyed by their full name.
    recording = self.recordings.get(name)
    if recording is not None:
        return recording

    # Fall back to a full traversal so that recordings living in subcorpora are still found.
    for candidate in self.all_recordings():
        if candidate.fullname() == name:
            return candidate

    # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
    raise AssertionError(f"Recording '{name}' was not found in corpus")

def get_segment_by_name(self, name: str) -> Segment:
    """
    Look up a segment anywhere in this corpus (including subcorpora) by its full name.

    :param name: full name of the segment; everything before the last "/" is taken
        as the full name of the recording that holds it
    :return: the segment specified by its full name
    :raises AssertionError: if no segment with that full name exists in the corpus
    """
    # Fast path: derive the recording's full name and use the dict lookups directly.
    recording_name = name.rsplit("/", 1)[0]
    recording = self.recordings.get(recording_name)
    if recording is not None and name in recording.segments:
        return recording.segments[name]

    # Fall back to a full traversal so that segments living in subcorpora are still found.
    for candidate in self.segments():
        if candidate.fullname() == name:
            return candidate

    # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
    raise AssertionError(f"Segment '{name}' was not found in corpus")

def all_recordings(self) -> Iterable[Recording]:
    """
    Iterate over every recording in this corpus, including those in subcorpora.

    :return: an iterator yielding this corpus' own recordings first, then those of
        each subcorpus (recursively)
    """
    yield from self.recordings.values()
    for subcorpus in self.subcorpora.values():
        yield from subcorpus.all_recordings()

def all_speakers(self) -> Iterable[Speaker]:
    """
    Iterate over every speaker in this corpus, including those in subcorpora.

    :return: an iterator yielding this corpus' own speakers first, then those of
        each subcorpus (recursively)
    """
    yield from self.speakers.values()
    for subcorpus in self.subcorpora.values():
        yield from subcorpus.all_speakers()

def top_level_recordings(self) -> Iterable[Recording]:
    """
    :return: an iterator over the recordings held directly by this corpus (subcorpora are not visited)
    """
    yield from self.recordings.values()

def top_level_subcorpora(self) -> Iterable[Corpus]:
    """
    :return: an iterator over the direct subcorpora of this corpus (no recursion)
    """
    yield from self.subcorpora.values()

def top_level_speakers(self) -> Iterable[Speaker]:
    """
    :return: an iterator over the speakers held directly by this corpus (subcorpora are not visited)
    """
    yield from self.speakers.values()

def remove_recording(self, recording: Recording):
    """
    Remove a recording from this corpus and, recursively, from all of its subcorpora.

    Removing a recording that is not present is a no-op.

    :param recording: the recording to remove; it is looked up by its full name
    """
    # The `recordings` dict is keyed by full name, so the membership test and the deletion
    # must both use `fullname()`; `pop` with a default does both in one step.
    self.recordings.pop(recording.fullname(), None)
    for subcorpus in self.subcorpora.values():
        subcorpus.remove_recording(recording)

def remove_recordings(self, recordings: List[Recording]):
    """
    Remove several recordings from this corpus and, recursively, from all of its subcorpora.

    Recordings that are not present are ignored.

    :param recordings: the recordings to remove; they are looked up by their full names
    """
    # Collect the keys once, then visit every (sub)corpus a single time instead of
    # walking the whole subcorpus tree once per recording.
    for fullname in {recording.fullname() for recording in recordings}:
        self.recordings.pop(fullname, None)
    for subcorpus in self.subcorpora.values():
        subcorpus.remove_recordings(recordings)

def add_recording(self, recording: Recording):
    """
    Attach a recording to this corpus, keyed by its full name.

    :param recording: the recording to add; its `corpus` attribute is set to this corpus
    """
    assert isinstance(recording, Recording)
    # The back-reference must be set first: `Recording.fullname()` is built from `recording.corpus`.
    recording.corpus = self
    self.recordings[recording.fullname()] = recording

def add_subcorpus(self, corpus: Corpus):
    """
    Attach a subcorpus to this corpus, keyed by its full name.

    :param corpus: the subcorpus to add; its `parent_corpus` attribute is set to this corpus
    """
    assert isinstance(corpus, Corpus)
    # Set the parent before computing the key — presumably `Corpus.fullname()` depends on
    # `parent_corpus`; TODO confirm.
    corpus.parent_corpus = self
    # NOTE(review): the SAX parser's <subcorpus> handling stores subcorpora under `subcorpus.name`,
    # whereas this method uses the full name — the two should agree on one key.
    self.subcorpora[corpus.fullname()] = corpus

def add_speaker(self, speaker: Speaker):
assert isinstance(speaker, Speaker)
Expand Down Expand Up @@ -281,9 +271,9 @@ def filter_segments(self, filter_function: FilterFunction):
filter all segments (including in subcorpora) using filter_function
:param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept
"""
for r in self.recordings:
r.segments = [s for s in r.segments if filter_function(self, r, s)]
for sc in self.subcorpora:
for r in self.recordings.values():
r.segments = {s.fullname(): s for s in r.segments.values() if filter_function(self, r, s)}
for sc in self.subcorpora.values():
sc.filter_segments(filter_function)

def load(self, path: str, *, reformat_orth: bool = True):
Expand Down Expand Up @@ -322,10 +312,10 @@ def _dump_internal(self, out: TextIO, indentation: str = ""):
if self.speaker_name is not None:
out.write('%s <speaker name="%s"/>\n' % (indentation, self.speaker_name))

for r in self.recordings:
for r in self.recordings.values():
r.dump(out, indentation + " ")

for sc in self.subcorpora:
for sc in self.subcorpora.values():
sc._dump_internal(out, indentation + " ")

if self.parent_corpus is None:
Expand Down Expand Up @@ -354,7 +344,7 @@ def __init__(self):
super().__init__()
self.audio: Optional[str] = None
self.corpus: Optional[Corpus] = None
self.segments: List[Segment] = []
self.segments: Dict[str, Segment] = {}

def fullname(self) -> str:
    """
    :return: the full name of the owning corpus and this recording's name, joined by "/"
    """
    return self.corpus.fullname() + "/" + self.name
Expand All @@ -375,21 +365,26 @@ def dump(self, out: TextIO, indentation: str = ""):
if self.speaker_name is not None:
out.write('%s <speaker name="%s"/>\n' % (indentation, self.speaker_name))

for s in self.segments:
for s in self.segments.values():
s.dump(out, indentation + " ")

out.write("%s</recording>\n" % indentation)

def get_segment_by_name(self, name: str) -> Segment:
    """
    Look up a segment of this recording by the key it is stored under.

    :param name: key of the segment in `self.segments` (segments are added under their full name)
    :return: the matching segment
    :raises AssertionError: if this recording holds no segment under that key
    """
    try:
        return self.segments[name]
    except KeyError:
        # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
        raise AssertionError(f"Segment '{name}' was not found in recording '{self.name}'") from None

def add_segment(self, segment: Segment):
    """
    Attach a segment to this recording, keyed by its full name.

    :param segment: the segment to add; its `recording` attribute is set to this recording
    """
    assert isinstance(segment, Segment)
    # Set the back-reference before computing the key — presumably `Segment.fullname()` depends
    # on `segment.recording`; TODO confirm.
    segment.recording = self
    self.segments[segment.fullname()] = segment

def get_segment_mapping(self) -> Dict[str, Segment]:
    """
    Build a fresh mapping of this recording's segments.

    The keys are recomputed from each segment's current `fullname()` rather than copied
    from `self.segments`.

    :return: Mapping from segment fullnames to actual segments.
    """
    return {segment.fullname(): segment for segment in self.segments.values()}

def __repr__(self):
    """
    :return: a debugging representation of the form ``<ClassName full/name>``
    """
    return f"<{self.__class__.__name__} {self.fullname()}>"
Expand Down
Loading