Skip to content

Commit 701e21d

Browse files
committed
using tedlium v2
1 parent fe597c8 commit 701e21d

File tree

8 files changed

+609
-0
lines changed

8 files changed

+609
-0
lines changed

common/baselines/tedlium2_v2/__init__.py

Whitespace-only changes.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from collections import defaultdict
2+
from typing import Dict
3+
4+
from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT
5+
from i6_experiments.common.datasets.tedlium2_v2.corpus import get_corpus_object_dict
6+
from i6_experiments.common.datasets.tedlium2_v2.lexicon import (
7+
get_g2p_augmented_bliss_lexicon,
8+
)
9+
from i6_experiments.common.setups.rasr.util import RasrDataInput
10+
11+
from i6_experiments.common.setups.rasr.config.lex_config import (
12+
LexiconRasrConfig,
13+
)
14+
from i6_experiments.common.setups.rasr.config.lm_config import ArpaLmRasrConfig
15+
from i6_experiments.common.baselines.tedlium2_v2.lm.ngram_config import run_tedlium2_ngram_lm
16+
17+
18+
def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dict[str, Dict[str, RasrDataInput]]:
    """
    Assemble one ``RasrDataInput`` per corpus returned by ``get_corpus_object_dict``.

    Every corpus gets the G2P-augmented bliss lexicon. Only the "dev" and "test"
    corpora additionally receive a language model, namely the one stored under
    ``interpolated_lms["dev-pruned"]["4gram"]`` of the n-gram LM system; all other
    corpora get ``lm=None``.

    :param add_unknown_phoneme_and_mapping: forwarded unchanged to both
        ``get_g2p_augmented_bliss_lexicon`` and ``run_tedlium2_ngram_lm``
    :return: nested mapping in which each corpus name is used as both the outer
        and the inner key, i.e. ``result[name][name]`` is the input for ``name``
    """
    corpora = get_corpus_object_dict(audio_format="wav", output_prefix="corpora")

    bliss_lexicon = get_g2p_augmented_bliss_lexicon(
        add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping,
        output_prefix="lexicon",
    )
    # NOTE(review): the meaning of the positional ``False`` is defined by LexiconRasrConfig - confirm there
    lexicon_config = LexiconRasrConfig(bliss_lexicon, False)

    ngram_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping)
    selected_lm = ngram_system.interpolated_lms["dev-pruned"]["4gram"]
    lm_config = ArpaLmRasrConfig(lm_path=selected_lm.ngram_lm)

    data_inputs = defaultdict(dict)
    for corpus_name, corpus_object in corpora.items():
        # get_dict() is deliberately called once per corpus, as in the original loop
        lexicon_dict = lexicon_config.get_dict()
        concurrency = CONCURRENT[corpus_name]
        if corpus_name in ("dev", "test"):
            lm_dict = lm_config.get_dict()
        else:
            lm_dict = None

        data_inputs[corpus_name][corpus_name] = RasrDataInput(
            corpus_object=corpus_object,
            lexicon=lexicon_dict,
            concurrent=concurrency,
            lm=lm_dict,
        )

    return data_inputs
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""
2+
Default tools and software, made independent of hashing
3+
by setting one explicit hash.
4+
5+
In order to use different software paths without hash changes, just use the same explicit hash string as given here.
6+
7+
If you want a stronger guarantee that you get the intended results, please consider using the explicit software
8+
version listed here. Nevertheless, the most recent "head" should be safe to use as well.
9+
"""
10+
from sisyphus import tk
11+
from i6_experiments.common.tools.audio import compile_ffmpeg_binary
12+
from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode
13+
from i6_experiments.common.tools.sctk import compile_sctk
14+
15+
# --- RASR ---
# Compiled from the "apptainer_tf_2_8" branch (use most recent RASR).
RASR_BINARY_PATH = compile_rasr_binaries_i6mode(
    branch="apptainer_tf_2_8",
    configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"],
)
# Alternative: point to an already existing RASR build instead of compiling one, e.g.
# RASR_BINARY_PATH = tk.Path("/work/asr4/rossenbach/neon_test/rasr_versions/rasr_no_tf/arch/linux-x86_64-standard/")
assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline"
# Explicit hash, so that changing the path above does not change the hash.
RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH"


# --- SCTK ---
# Compiled from the "v2.4.12" branch (use last published version).
SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12")
SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH"

# --- SRILM ---
# Fixed path to an existing SRILM binary directory; nothing is compiled here.
SRILM_PATH = tk.Path("/work/tools/users/luescher/srilm-1.7.3/bin/i686-m64/")
SRILM_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SRILM_PATH"

common/baselines/tedlium2_v2/gmm/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)