Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
81cec26
Convert corpus structure to dict
Icemole Jul 10, 2025
3ba9608
Convert recording structure to dict
Icemole Jul 10, 2025
b066017
Fix `Corpus.segments()` call
Icemole Jul 10, 2025
16c7271
Use `rsplit` instead of splitting and concatenating back
Icemole Jul 10, 2025
28ce889
Add recording at the beginning
Icemole Jul 14, 2025
14247a5
Fix
Icemole Jul 14, 2025
4cd832a
Add name to NamedEntity/Corpus/Recording/Segment init
Icemole Jul 14, 2025
dc8b4e4
Use newly declared parameters in init
Icemole Jul 14, 2025
9d23872
Directly copy self segments
Icemole Jul 15, 2025
a673f83
Better init
Icemole Jul 15, 2025
1ebeb7d
Corpus: add subcorpora, recordings properties as read only
Icemole Jul 15, 2025
1e678e8
Update filter segments function
Icemole Jul 15, 2025
de59bec
Corpus: add subcorpora, recordings as properties (2)
Icemole Jul 15, 2025
b6003a3
Recording: add segments as property
Icemole Jul 15, 2025
a1ea859
Segment: add assertion in fullname
Icemole Jul 15, 2025
6d41c46
Always return iterables
Icemole Jul 15, 2025
d5fa7a9
Set explicit read only properties
Icemole Jul 15, 2025
151966c
Improve docstring
Icemole Jul 15, 2025
b406ac7
Add remove_segment call
Icemole Jul 15, 2025
4d356df
Fix recording segments call
Icemole Jul 15, 2025
42e69d2
Fix Recording.segments calls throughout the repo
Icemole Jul 15, 2025
f1f0d73
Add proper setters
Icemole Jul 15, 2025
f671320
Take advantage of setter
Icemole Jul 15, 2025
8472e3e
Fix recording call
Icemole Jul 15, 2025
cb4856f
More fixes
Icemole Jul 15, 2025
5ac2ec1
Update include corpus
Icemole Jul 15, 2025
e388a22
Add assertions that element must not exist in internal structure when…
Icemole Jul 15, 2025
e0f9473
Add docstring
Icemole Jul 15, 2025
20ad0ad
Apply suggestions from code review
Icemole Jul 16, 2025
a12a430
Use name instead of full name
Icemole Jul 16, 2025
c03ec82
Remove redundant conversion to list
Icemole Jul 16, 2025
9278b1e
Improve retrieval of segments from corpus/recording
Icemole Jul 16, 2025
fdc7315
Use Corpus API
Icemole Aug 11, 2025
be48a26
Add attributes/types to base class
Icemole Aug 11, 2025
f7b42f8
Various improvements to user class init
Icemole Aug 11, 2025
a160dc6
Remove unneeded assertion
Icemole Aug 11, 2025
6a955e3
Add comma
Icemole Aug 11, 2025
eda8ab2
Improve docstring
Icemole Aug 11, 2025
a22e8bf
Work
Icemole Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions audio/ffmpeg.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,9 @@ def run_recover_duration(self):
c.load("temp_corpus.xml.gz")

for r in c.all_recordings():
assert len(r.segments) == 1, "needs to be a single segment recording"
segment = r.segments[0]
recording_segments = list(r.segments)
assert len(recording_segments) == 1, "needs to be a single segment recording"
segment = recording_segments[0]
old_duration = segment.end
assert r.audio is not None
data, sample_rate = soundfile.read(open(r.audio, "rb"))
Expand Down
25 changes: 12 additions & 13 deletions corpus/data_augmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ def run(self):
r.max_seg_end = max_seg_end

# select noise files for each recording
for i, r in enumerate(c.recordings):
recording_list = list(c.recordings)
for i, r in enumerate(recording_list):
audio_name = r.audio
target_length = r.max_seg_end
reverbed_audio_name = "noised_" + audio_name.split("/")[-1]
Expand All @@ -91,11 +92,11 @@ def run(self):
noise_audios = []

while noise_length < target_length:
random_index = rng.randint(0, len(c.recordings) - 1)
random_index = rng.randint(0, len(recording_list) - 1)
while random_index == i:
random_index = random.randint(0, len(c.recordings) - 1)
noise_audios.append(c.recordings[random_index])
noise_length += c.recordings[random_index].max_seg_end
random_index = random.randint(0, len(recording_list) - 1)
noise_audios.append(recording_list[random_index])
noise_length += recording_list[random_index].max_seg_end

# create temp noise file
temp_noise_track_file = "/dev/shm/{id}/tmp_concat_%i.wav" % n
Expand Down Expand Up @@ -134,15 +135,14 @@ def run(self):
command = ffmpeg_head + noise_inputs + filter_head + volume_reduction + mixer + filter_tail
self.sh(command)

nr = corpus.Recording()
nr = corpus.Recording(corpus=nc)
nr.name = r.name
nr.segments = r.segments
nr.speaker_name = r.speaker_name
nr.default_speaker = r.default_speaker
nr.speakers = r.speakers
nr.audio = str(self.out_audio_folder) + "/" + reverbed_audio_name
nc.add_recording(nr)
for s in nr.segments:
for s in r.segments:
nr.add_segment(s)
segment_file_names.append(nc.name + "/" + nr.name + "/" + s.name + "\n")

nc.dump(self.out_corpus.get_path())
Expand Down Expand Up @@ -203,15 +203,14 @@ def run(self):
"-ar {base_frequency} '{audio_out}/%s'" % (r.audio, perturbed_audio_name)
)

pr = corpus.Recording()
pr = corpus.Recording(corpus=nc)
pr.name = r.name
pr.segments = r.segments
pr.speaker_name = r.speaker_name
pr.speakers = r.speakers
pr.default_speaker = r.default_speaker
pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name
nc.add_recording(pr)
for s in pr.segments:
for s in r.segments:
pr.add_segment(s)
segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name)
s.start /= self.speed_factor
s.end /= self.speed_factor
Expand Down
11 changes: 4 additions & 7 deletions corpus/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,11 @@ def _delete_empty_recordings(corpus: corpus.Corpus, removed_recordings_file: str
:param corpus: Corpus for which to delete the empty recordings.
:param removed_recordings_file: File in which to dump all recordings that have been deleted.
"""
to_delete = []
for rec in corpus.all_recordings():
if not rec.segments:
to_delete.append(rec)

corpus.remove_recordings(to_delete)
with open(removed_recordings_file, "w") as f:
f.write("\n".join(rec.fullname() for rec in to_delete))
for rec in corpus.all_recordings():
if not rec.segments:
corpus.remove_recording(rec)
f.write(f"{rec.fullname()}\n")


class FilterSegmentsByListJob(Job):
Expand Down
19 changes: 9 additions & 10 deletions corpus/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,22 +70,22 @@ def run(self):
words = [s[1] for s in transcriptions[recording.name]]

if len(words) == 0 and self.remove_empty_segments:
recordings_to_delete = recording
recordings_to_delete = recording # TODO: this does nothing.
continue

segments_to_delete = []
for idx, segment in enumerate(recording.segments):
for segment in recording.segments:
left_idx = bisect.bisect_left(times, segment.start)
right_idx = bisect.bisect_left(times, segment.end)

if left_idx == right_idx and self.remove_empty_segments:
segments_to_delete.append(idx)
segments_to_delete.append(segment)
continue

segment.orth = " ".join(words[left_idx:right_idx]).replace("&", "&amp;")

for sidx in reversed(segments_to_delete):
del recording.segments[sidx]
for segment in reversed(segments_to_delete):
recording.remove_segment(segment)

c.dump(self.output_corpus_path.get_path())

Expand Down Expand Up @@ -223,7 +223,7 @@ def run(self):
sm_entry.value = "/".join([c.name, split_name, segment.name])
sm.map_entries.append(sm_entry)

new_recording_element.segments.append(segment)
new_recording_element.add_segment(segment)
segment_count += 1

# update the time stamp with the recording length and add to ffmpeg merge list
Expand Down Expand Up @@ -547,15 +547,14 @@ def run(self):
nc.speaker_name = c.speaker_name
# store index of last segment
for r in c.recordings:
sr = corpus.Recording()
sr = corpus.Recording(corpus=nc)
sr.name = r.name
sr.segments = r.segments
sr.speaker_name = r.speaker_name
sr.speakers = r.speakers
sr.default_speaker = r.default_speaker
sr.audio = r.audio
nc.add_recording(sr)
for s in sr.segments:
for s in r.segments:
sr.add_segment(s)
segment_file_names.append(nc.name + "/" + sr.name + "/" + s.name)
s.start += self.shift

Expand Down
7 changes: 2 additions & 5 deletions datasets/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,22 +136,19 @@ def run(self):

for transcript in self._transcripts:
name = "{0}-{1}-{2:04d}".format(transcript["speaker_id"], transcript["chapter"], transcript["segment"])
recording = corpus.Recording()
recording = corpus.Recording(corpus=c)
recording.name = name
recording.speaker_name = transcript["speaker_id"]
recording.audio = "{}/{}.flac".format(transcript["path"], name)

used_speaker_ids.add(transcript["speaker_id"])

segment = corpus.Segment()
segment = corpus.Segment(recording=recording)
segment.name = name
segment.start = 0
segment.end = float("inf")
segment.orth = transcript["orth"].strip()

recording.segments.append(segment)
c.recordings.append(recording)

for speaker_id, speaker_info in sorted(self._speakers.items()):
if speaker_id not in used_speaker_ids:
continue
Expand Down
7 changes: 2 additions & 5 deletions datasets/switchboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def run(self):
rec_to_segs.pop("sw02167B")

for rec_name, segs in sorted(rec_to_segs.items()):
recording = corpus.Recording()
recording = corpus.Recording(corpus=c)
recording.name = rec_name
recording.audio = os.path.join(self.audio_dir.get_path(), rec_name + ".wav")

Expand All @@ -272,7 +272,7 @@ def run(self):
rec_speaker_id = rec_to_speaker[rec_name]["speaker_id"]

for seg in segs:
segment = corpus.Segment()
segment = corpus.Segment(recording=recording)
segment.name = seg[0]
segment.start = float(seg[1])
segment.end = float(seg[2])
Expand All @@ -281,9 +281,6 @@ def run(self):
if len(segment.orth) == 0:
continue

recording.segments.append(segment)
c.recordings.append(recording)

# add speakers to corpus
for speaker_info in rec_to_speaker.values():
speaker = corpus.Speaker()
Expand Down
Loading