Skip to content

Commit 7b3b2db

Browse files
committed
add more checks and filtering in nemo pipeline
1 parent f6f7213 commit 7b3b2db

File tree

5 files changed

+104
-5
lines changed

5 files changed

+104
-5
lines changed

ssak/utils/kaldi_dataset.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,29 @@ def get_duration(self, mode=sum, target="segment"):
247247
return mode(durations)
248248
return mode([i.duration for i in self.dataset])
249249

250+
def check_if_segments_in_audios(self, acceptance_end_s=0.25):
    """
    Drop the segments that do not fit inside their audio file.

    The duration of each distinct ``audio_path`` is read once with
    ``pydub.utils.mediainfo`` (rounded to 3 decimals) and cached. A segment
    is removed when its ``start`` is at or past the audio duration, or when
    its ``end`` exceeds the duration by more than ``acceptance_end_s``.
    ``self.dataset`` is replaced by the kept segments, and every removed
    segment is written (one ``str(row)`` per line) to the file
    ``removed_lines_not_in_audios`` in the current working directory.

    Args:
        acceptance_end_s: tolerance added to the audio duration before a
            segment end is considered out of bounds (presumably seconds,
            like the segment times -- TODO confirm).
    """
    from pydub.utils import mediainfo

    duration_by_path = {}
    kept_rows = []
    dropped_rows = []
    for segment in tqdm(self, desc="Check if segments are in audios"):
        path = segment.audio_path
        # Probe each audio file only once; later segments reuse the cached value.
        if path not in duration_by_path:
            duration_by_path[path] = round(float(mediainfo(path)["duration"]), 3)
        audio_duration = duration_by_path[path]
        starts_after_audio = segment.start >= audio_duration
        outside_audio = starts_after_audio or segment.end > audio_duration + acceptance_end_s
        if outside_audio:
            dropped_rows.append(segment)
        else:
            kept_rows.append(segment)
    self.dataset = kept_rows
    logger.info(f"Removed {len(dropped_rows)} segments that were not in audios (start or end after audio), check removed_lines_not_in_audios file")
    with open("removed_lines_not_in_audios", "w") as f:
        f.writelines(str(segment) + "\n" for segment in dropped_rows)
272+
250273
def filter_by_audio_ids(self, audio_ids):
251274
"""
252275
Filter the dataset by audio ids
@@ -357,7 +380,7 @@ def normalize_audios(self, output_wavs_conversion_folder, target_sample_rate=160
357380
else:
358381
removed_lines.append(row)
359382
self.dataset = new_dataset
360-
with open("removed_lines", "w") as f:
383+
with open("removed_lines_audio_empty", "w") as f:
361384
for row in removed_lines:
362385
f.write(str(row) + "\n")
363386

tools/nemo/convert_kaldi_dataset_to_nemo.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55

6+
from find_long_transcriptions import filter_incoherent_segments
67
from tqdm import tqdm
78

89
from ssak.utils.kaldi_dataset import KaldiDataset
@@ -49,7 +50,7 @@ def kaldi_to_nemo(kaldi_dataset, output_file):
4950
f.write("\n")
5051

5152

52-
def convert_dataset(kaldi_input_dataset, output_dir, new_audio_folder=None, check_audio=False):
53+
def convert_dataset(kaldi_input_dataset, output_dir, new_audio_folder=None, check_audio=False, check_if_in_audio=False, remove_incoherent_texts=False):
5354
logger.info(f"Converting Kaldi dataset {kaldi_input_dataset} to NeMo format")
5455
splitted_path = kaldi_input_dataset.split(os.sep)
5556
if splitted_path[-1] == "":
@@ -82,9 +83,15 @@ def convert_dataset(kaldi_input_dataset, output_dir, new_audio_folder=None, chec
8283
target_extension="wav",
8384
num_workers=6,
8485
) # wavs are faster to load than mp3
86+
if check_if_in_audio:
87+
logger.info("Check if segments are in audios")
88+
kaldi_dataset.check_if_segments_in_audios()
8589
logger.info(f"Writing to {file}")
8690
os.makedirs(output_dir, exist_ok=True)
8791
kaldi_to_nemo(kaldi_dataset, file)
92+
if remove_incoherent_texts:
93+
logger.info("Check for incoherent texts (very long text with a short audio segment)")
94+
filter_incoherent_segments(file, file + "_removed_lines")
8895
logger.info(f"Conversion done (saved to {len(kaldi_dataset)} lines to {file})")
8996

9097

tools/nemo/convert_kaldi_datasets_to_nemo.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
logger = logging.getLogger(__name__)
1010

1111

12-
def convert_datasets(inputs: list, output_file, output_wav_dir=None, check_audio=False):
12+
def convert_datasets(inputs: list, output_file, output_wav_dir=None, check_audio=False, check_if_in_audio=False, remove_incoherent_texts=False):
1313
input_files = inputs
1414
if len(input_files) == 1:
1515
logger.warning("One input file, considering it as containing a list of files")
@@ -20,7 +20,7 @@ def convert_datasets(inputs: list, output_file, output_wav_dir=None, check_audio
2020
raise FileNotFoundError(f"Non-existing file {input_folder}")
2121
if not os.path.isdir(input_folder):
2222
raise NotADirectoryError(f"File {input_folder} is not a directory")
23-
convert_dataset(input_folder, output_file, output_wav_dir, check_audio=check_audio)
23+
convert_dataset(input_folder, output_file, output_wav_dir, check_audio=check_audio, check_if_in_audio=check_if_in_audio, remove_incoherent_texts=remove_incoherent_texts)
2424
logger.info(f"Finished converting datasets from {input_files} to {output_file}")
2525

2626

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import argparse
2+
import json
3+
import logging
4+
import shutil
5+
6+
from tqdm import tqdm
7+
8+
logging.basicConfig(level=logging.INFO)
9+
logger = logging.getLogger(__name__)
10+
11+
# As a general rule, conversations between adults run at about 200 words/minute.
12+
# Some speakers go up to 300 words/minute, whereas for audio recordings a pace of about 150 words/minute is recommended to remain clearly audible.
13+
# a bit less than 5 characters per word: 4.65 for journalistic text and 4.89 for literature
14+
# if we add 2 more characters (spaces) it means we have around 7 characters per word (8 to be large)
15+
# fast reading person: 160wpm (2.7wps)
16+
# speaking person: 200wpm (3.3wps)
17+
# ultra fast: 300wpm (5wps) (probably possible to happen for very short duration but not for more than a few sec)
18+
# so for ultra fast 1s we have around 5*8=40 characters
19+
# So, we can safely say that a 1s segment can't go over 50 characters
20+
# 5s: 3.3*8*5=132c/ 5*8*5=200
21+
# 10s: 3.3*8*10=264c/ 400c
22+
# 20s: 528c
23+
# 30s: 792c
24+
25+
# Maximum accepted text length (in characters) per segment-duration bucket.
# Each key is an exclusive upper bound on the segment duration and each value
# is the longest text accepted for a segment shorter than that bound; segments
# at or above the largest bound use the last value. Keys must stay in
# ascending order because the lookup stops at the first bound that matches.
# NOTE(review): "THREEHOLD" looks like a typo for "THRESHOLD"; the name is
# kept as-is since other modules may import it.
INCOHERENT_THREEHOLD = {1: 50, 5: 200, 10: 350, 20: 550, 30: 700}
26+
27+
28+
def filter_incoherent_segments(input_file, filtered_out_file):
    """
    Remove from a JSON-lines manifest the rows whose text is too long for their duration.

    Each row must provide a ``duration`` and a ``text`` field. The maximum
    accepted text length is taken from ``INCOHERENT_THREEHOLD``: the value of
    the smallest duration bound strictly greater than the row duration, or the
    value of the largest bound when the duration exceeds all of them.

    Rows within the limit are written to a temporary ``<input_file>.tmp``
    which then replaces ``input_file``; rejected rows are written to
    ``filtered_out_file``. A summary is printed on stdout.

    Args:
        input_file: path of the manifest to filter; it is rewritten in place.
        filtered_out_file: path where the rejected rows are written.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row lacks the ``duration`` or ``text`` field.
    """
    with open(input_file, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) are not valid JSON documents: skip them.
        data = [json.loads(line) for line in f if line.strip()]

    # Sort once so the bucket lookup does not depend on the dict insertion order.
    buckets = sorted(INCOHERENT_THREEHOLD.items())
    fallback_max_text = buckets[-1][1]
    ct_dict = {max_len: 0 for max_len in INCOHERENT_THREEHOLD.values()}
    ct = 0
    tmp_file = input_file + ".tmp"
    with open(tmp_file, "w", encoding="utf-8") as kept, open(filtered_out_file, "w", encoding="utf-8") as log:
        for row in tqdm(data, desc="Checking for incoherent texts lengths"):
            dur = float(row["duration"])
            max_text = next((max_len for bound, max_len in buckets if dur < bound), fallback_max_text)
            if len(row["text"]) > max_text:
                ct += 1
                ct_dict[max_text] += 1
                out = log
            else:
                out = kept
            json.dump(row, out, ensure_ascii=False)
            out.write("\n")
    print(f"Find {ct} long texts in {input_file}")
    print(f"Removed: {ct_dict}")
    # Replace the manifest only once the filtered copy has been fully written.
    shutil.move(tmp_file, input_file)
55+
56+
57+
if __name__ == "__main__":
    # Command-line entry point: filter the given manifest in place and write
    # the rejected rows to the given output file.
    cli = argparse.ArgumentParser(description="Remove incoherent lines from nemo manifest")
    cli.add_argument("file", type=str, help="Input file")
    cli.add_argument("output", type=str, help="output file")
    # parser.add_argument('--max_char', help="Depends on segments max length", type=int, default=700)
    options = cli.parse_args()
    filter_incoherent_segments(options.file, options.output)

tools/nemo/pipeline_prepare_nemo_data.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
from generate_dataset_list_files import generate_dataset_list_files
88
from merge_manifest import merge_manifests
99

10+
CHECK_AUDIO = True
11+
CHECK_IF_SEGMENT_IN_AUDIO = False
12+
REMOVE_INCOHERENT_TEXTS = True
13+
1014
if __name__ == "__main__":
1115
parser = argparse.ArgumentParser(description="Prepare data for Nemo")
1216
parser.add_argument("--train_input_datasets", help="Input datasets", type=str, default=None)
@@ -103,7 +107,9 @@
103107
[os.path.join(tmp_manifest_dir, "datasets_list", f"{i}_datasets")],
104108
os.path.join(tmp_manifest_dir, f"{i}_manifests"),
105109
output_wav_dir,
106-
check_audio=True,
110+
check_audio=CHECK_AUDIO,
111+
check_if_in_audio=CHECK_IF_SEGMENT_IN_AUDIO,
112+
remove_incoherent_texts=REMOVE_INCOHERENT_TEXTS,
107113
)
108114
except FileExistsError:
109115
pass

0 commit comments

Comments
 (0)