Skip to content

Commit 8c294b3

Browse files
committed
add custom corpus transformation jobs
1 parent 6ad1a0c commit 8c294b3

File tree

1 file changed

+90
-1
lines changed

1 file changed

+90
-1
lines changed

users/rossenbach/corpus/transform.py

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
1+
import os.path
2+
from typing import Iterator
3+
14
from sisyphus import Job, Task, tk
25
import enum
36
import random
47
import numpy as np
58

9+
from i6_core.lib import corpus
610
from i6_core.lib.corpus import Corpus, Recording, Segment
711
from i6_core.lib.lexicon import Lexicon
8-
from i6_core.corpus.transform import ApplyLexiconToCorpusJob
12+
from i6_core.corpus.transform import ApplyLexiconToCorpusJob, MergeStrategy, MergeCorporaJob
913

1014
class LexiconStrategy(enum.Enum):
1115
PICK_FIRST = 0
@@ -102,3 +106,88 @@ def run(self):
102106

103107
c.dump(self.out_corpus.get())
104108

109+
110+
class RandomAssignSpeakersFromCorpus(Job):
    """
    Takes another bliss corpus as speaker reference and randomly distributes speaker tags

    The speaker table of the reference corpus replaces the one of the input corpus.
    Every segment then gets a speaker name drawn at random (with replacement) from
    that table, while the recording-level speaker settings are cleared.

    Used e.g. for synthetic TTS data
    """

    def __init__(self, bliss_corpus: tk.Path, speaker_reference_bliss_corpus: tk.Path, seed: int = 42):
        """

        :param bliss_corpus: bliss corpus to assign speakers to
        :param speaker_reference_bliss_corpus: bliss corpus to take speakers from
        :param seed: random seed for deterministic behavior
        """
        self.bliss_corpus = bliss_corpus
        self.speaker_reference_bliss_corpus = speaker_reference_bliss_corpus
        self.seed = seed

        self.out_corpus = self.output_path("corpus.xml.gz")

    def tasks(self) -> Iterator[Task]:
        """Yield the single "run" task, executed as a mini task."""
        yield Task("run", mini_task=True)

    def run(self) -> None:
        """
        Load both corpora, assign a random reference speaker to every segment and
        write the result to ``self.out_corpus``.
        """
        # Use a private generator instead of np.random.seed(): it yields the same
        # legacy draw sequence for a given seed, but does not reset the process-wide
        # NumPy random state that other code may rely on.
        rng = np.random.RandomState(self.seed)

        out_corpus = Corpus()
        out_corpus.load(self.bliss_corpus.get_path())

        speaker_corpus = Corpus()
        speaker_corpus.load(self.speaker_reference_bliss_corpus.get_path())

        # take over the complete speaker table of the reference corpus
        out_corpus.speakers = speaker_corpus.speakers
        speaker_name_list = list(out_corpus.speakers.keys())
        num_speakers = len(speaker_name_list)

        for recording in out_corpus.all_recordings():
            # speakers are set per segment, so drop any recording-level speaker info
            recording.speaker_name = None
            recording.default_speaker = None
            for segment in recording.segments:
                # NOTE: raises ValueError if the reference corpus contains no speakers
                segment.speaker_name = speaker_name_list[rng.randint(num_speakers)]

        out_corpus.dump(self.out_corpus.get_path())
154+
155+
156+
class MergeCorporaWithPathResolveJob(MergeCorporaJob):
    """
    Merges Bliss Corpora files into a single file as subcorpora or flat

    resolves relative paths to absolute

    Audio paths are resolved against the directory that contains the respective
    corpus file; paths that are already absolute are kept as they are
    (``os.path.join`` semantics).

    ``run`` reads ``self.name``, ``self.bliss_corpora``, ``self.merge_strategy`` and
    ``self.out_merged_corpus``, which are not set in this class
    (presumably by ``MergeCorporaJob.__init__`` -- verify against i6_core).
    """

    def run(self) -> None:
        """
        Load all corpora, make their audio paths absolute, merge them according to
        ``self.merge_strategy`` and write the merged corpus.

        :raises AssertionError: if a resolved audio file does not exist or the
            merge strategy is not one of SUBCORPORA, FLAT or CONCATENATE
        """
        merged_corpus = corpus.Corpus()
        merged_corpus.name = self.name
        for corpus_path in self.bliss_corpora:
            corpus_file = corpus_path.get_path()
            c = corpus.Corpus()
            c.load(corpus_file)

            # Make all audio paths absolute
            corpus_dir = os.path.dirname(corpus_file)
            for recording in c.all_recordings():
                absolute_audio = os.path.join(corpus_dir, recording.audio)
                # Explicit raise instead of a bare ``assert``: the check must not vanish
                # under ``python -O`` and the error should name the missing file.
                if not os.path.exists(absolute_audio):
                    raise AssertionError(
                        f"audio file not found: {absolute_audio} (referenced by corpus {corpus_file})"
                    )
                recording.audio = absolute_audio

            if self.merge_strategy == MergeStrategy.SUBCORPORA:
                # keep each input corpus as its own subcorpus
                merged_corpus.add_subcorpus(c)
            elif self.merge_strategy == MergeStrategy.FLAT:
                # put all recordings directly into the merged corpus
                for rec in c.all_recordings():
                    merged_corpus.add_recording(rec)
                merged_corpus.speakers.update(c.speakers)
            elif self.merge_strategy == MergeStrategy.CONCATENATE:
                # append the top-level content of each input corpus as-is
                for subcorpus in c.top_level_subcorpora():
                    merged_corpus.add_subcorpus(subcorpus)
                for rec in c.top_level_recordings():
                    merged_corpus.add_recording(rec)
                for speaker in c.top_level_speakers():
                    merged_corpus.add_speaker(speaker)
            else:
                # ``assert False`` would be stripped under ``python -O`` and the job
                # would then silently write an empty corpus.
                raise AssertionError("invalid merge strategy")

        merged_corpus.dump(self.out_merged_corpus.get_path())

0 commit comments

Comments
 (0)