Skip to content

Commit 9430af2

Browse files
committed
remove rows with # from yodas + better logs
1 parent 2335cad commit 9430af2

File tree

3 files changed

+25
-15
lines changed

3 files changed

+25
-15
lines changed

ssak/utils/kaldi_converter.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -113,15 +113,17 @@ def merge_data(self, dataset, new_data):
113113
diff_a_b = set(dict_dataset.keys()).difference(set(dict_new_data.keys()))
114114
diff_b_a = set(dict_new_data.keys()).difference(set(dict_dataset.keys()))
115115
logger.warning(f"The data you are trying to merge have different lengths at step {self.__class__.__name__} (execute_order={self.execute_order})!")
116-
logger.warning(f"Dataset {len(dataset)} has {len(diff_a_b)} not present in new data")
117-
logger.warning(f"New data {len(new_data)} has {len(diff_b_a)} not present in dataset")
118-
logger.warning("Writing ids to debug.txt")
119-
with open("debug.txt", "w") as f:
120-
if len(diff_a_b) > 0:
121-
f.write("In datset but not in new data:\n")
122-
for i in diff_a_b:
123-
f.write(f"{i}\n")
124-
if len(diff_b_a) > 0:
116+
logger.warning(f"Dataset ({len(dataset)} rows) has {len(diff_a_b)} rows not present in new data")
117+
logger.warning(f"New data ({len(new_data)} rows) has {len(diff_b_a)} rows not present in dataset")
118+
logger.warning("Writing ids to log2kaldi/missing_ids.txt")
119+
os.makedirs("log2kaldi", exist_ok=True)
120+
if len(diff_a_b) > 0:
121+
with open(os.path.join("kaldi_data_processing",f"merge_new_data_missing_{self.execute_order}_{self.__class__.__name__}.txt"), "w") as f:
122+
f.write("In dataset but not in new data:\n")
123+
for i in diff_a_b:
124+
f.write(f"{i}\n")
125+
if len(diff_b_a) > 0:
126+
with open(os.path.join("kaldi_data_processing",f"merge_dataset_missing_{self.execute_order}_{self.__class__.__name__}.txt"), "w") as f:
125127
f.write("In new data but not in dataset:\n")
126128
for i in diff_b_a:
127129
f.write(f"{i}\n")

ssak/utils/kaldi_dataset.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010

1111
logger = logging.getLogger(__name__)
1212

13+
LOG_FOLDER = "kaldi_data_processing"
1314

1415
@dataclass
1516
class KaldiDatasetRow:
@@ -266,7 +267,8 @@ def check_if_segments_in_audios(self, acceptance_end_s=0.25):
266267
new_data.append(row)
267268
self.dataset = new_data
268269
logger.info(f"Removed {len(removed_lines)} segments that were not in audios (start or end after audio), check removed_lines_not_in_audios file")
269-
with open("removed_lines_not_in_audios", "w") as f:
270+
os.makedirs(LOG_FOLDER, exist_ok=True)
271+
with open(os.path.join(LOG_FOLDER, "removed_lines_not_in_audios"), "w") as f:
270272
for row in removed_lines:
271273
f.write(str(row) + "\n")
272274

@@ -380,7 +382,8 @@ def normalize_audios(self, output_wavs_conversion_folder, target_sample_rate=160
380382
else:
381383
removed_lines.append(row)
382384
self.dataset = new_dataset
383-
with open("removed_lines_audio_empty", "w") as f:
385+
os.makedirs(LOG_FOLDER, exist_ok=True)
386+
with open(os.path.join(LOG_FOLDER, "removed_lines_audio_empty"), "w") as f:
384387
for row in removed_lines:
385388
f.write(str(row) + "\n")
386389

@@ -594,7 +597,8 @@ def apply_filter(self, filter, filter_out=True):
594597
else:
595598
removed_lines.append(row)
596599
self.dataset = new_data
597-
with open("filtered_out", "w") as f:
600+
os.makedirs(LOG_FOLDER, exist_ok=True)
601+
with open(os.path.join(LOG_FOLDER, "filtered_out"), "w") as f:
598602
for row in removed_lines:
599603
f.write(str(row) + "\n")
600604

tools/kaldi/datasets2kaldi/yodas2kaldi.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
parser = argparse.ArgumentParser(description="Convert yodas dataset to Kaldi format")
1414
parser.add_argument("--force", action="store_true", default=True)
1515
parser.add_argument("--input", type=str, default="/data-server/datasets/audio/transcript/fr/YODAS/fr000")
16-
parser.add_argument("--output", type=str, default="/data-server/datasets/audio/kaldi/fr/YODAS/fr000")
16+
parser.add_argument("--output", type=str, default="/data-server/datasets/audio/kaldi/fr/YODAS/fr000_2")
1717
args = parser.parse_args()
1818

1919
input_dataset = args.input
@@ -52,14 +52,18 @@
5252
spk_ids = Row2Info("id", ["speaker"], 4, None, None)
5353
dev_reader = Reader2Kaldi(input_dataset, processors=[texts, durations, audios, audio_ids, spk_ids])
5454
dataset = dev_reader.load(debug=False, accept_missing_speaker=True)
55-
dataset.normalize_audios(os.path.join(input_dataset, "converted"), target_extension="wav", num_workers=16)
56-
55+
5756
def filter(row):
5857
if row.id.startswith("E--pPwqi_50-"):
5958
return True
59+
elif "#" in row.text:
60+
return True
6061
return False
6162

6263
removed_lines = dataset.apply_filter(filter)
64+
65+
dataset.normalize_audios(os.path.join(input_dataset, "converted"), target_extension="wav", num_workers=16)
66+
6367
logger.info(f"Dataset duration: {dataset.get_duration('sum')/3600:.2f}h")
6468
dataset.save(raw, False)
6569

0 commit comments

Comments
 (0)