linagora-labs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/ssak/utils/align_transcriptions.py b/ssak/utils/align_transcriptions.py
Lines changed: 1 addition & 2 deletions
diff --git a/ssak/utils/audio.py b/ssak/utils/audio.py
Lines changed: 2 additions & 1 deletion
diff --git a/ssak/utils/kaldi.py b/ssak/utils/kaldi.py
Lines changed: 3 additions & 3 deletions
diff --git a/ssak/utils/kaldi_converter.py b/ssak/utils/kaldi_converter.py
Lines changed: 11 additions & 9 deletions
diff --git a/ssak/utils/kaldi_dataset.py b/ssak/utils/kaldi_dataset.py
Lines changed: 31 additions & 4 deletions
diff --git a/ssak/utils/monitoring.py b/ssak/utils/monitoring.py
Lines changed: 28 additions & 7 deletions
diff --git a/ssak/utils/text_latin.py b/ssak/utils/text_latin.py
Lines changed: 9 additions & 19 deletions
diff --git a/ssak/utils/wer.py b/ssak/utils/wer.py
Lines changed: 4 additions & 1 deletion
@@ -1,7 +1,7 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 default_language_version:
-    python: python3.10
+    python: python3
 repos:
 -   repo: https://github.com/charliermarsh/ruff-pre-commit
     rev: v0.1.11
 
@@ -38,6 +38,7 @@ This repository focus on the following features:
 ├── tools/           : Scripts to cope with audio data (data curation, ...)
 │   ├── kaldi/utils/    : Scripts to check and complete kaldi's data folders (.sh and .pl scripts)
 │   ├── LeVoiceLab/     : Scripts to convert data from/to LeVoiceLab format (see https://speech-data-hub.levoicelab.org/)
+│   ├── nemo/           : Scripts to manipulate, prepare and convert data to NeMo format
 │   └── scraping/       : Scripts to scrape a collection of documents (docx, pdf...) or the web
 ├── docker/          : Docker environment
 └── tests/           : Unittest suite
 
@@ -13,8 +13,7 @@
     load_model,
 )
 from ssak.utils.misc import hashmd5
-from ssak.utils.text_basic import transliterate
-from ssak.utils.text_basic import _punctuation
+from ssak.utils.text_basic import _punctuation, transliterate
 from ssak.utils.viewer import PlayWav
 
 imshow_opts = dict(origin="upper", aspect="auto", vmax=0)  # vmin = -25,
 
@@ -75,8 +75,9 @@ def load_audio(path, start=None, end=None, sample_rate=16_000, mono=True, return
             audio, sr = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
         else:
             audio, sr = torchaudio.load(path)
-    if return_format=="librosa":
+    if return_format == "librosa":
         import librosa
+
         offset = float(start if start else 0)
         duration = None
         if end:
 
@@ -58,10 +58,10 @@ def parse_line(line):
 }
 
 
-def check_kaldi_dir(dirname, language=None, strict_sort=False):
+def check_kaldi_dir(dirname, language=None, strict_sort=False, tool_dir=None):
     strict_sort = "true" if strict_sort else "false"
-    tool_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "tools", "kaldi", "utils")
-
+    if not tool_dir:
+        tool_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "tools", "kaldi", "utils")
     if os.path.isfile(os.path.join(dirname, "text")):
         with open(os.path.join(dirname, "text")) as f:
             texts = dict(parse_line(line) for line in f)
 
@@ -113,15 +113,17 @@ def merge_data(self, dataset, new_data):
                 diff_a_b = set(dict_dataset.keys()).difference(set(dict_new_data.keys()))
                 diff_b_a = set(dict_new_data.keys()).difference(set(dict_dataset.keys()))
                 logger.warning(f"The data you are trying to merge have different lengths at step {self.__class__.__name__} (execute_order={self.execute_order})!")
-                logger.warning(f"Dataset {len(dataset)} has {len(diff_a_b)} not present in new data")
-                logger.warning(f"New data {len(new_data)} has {len(diff_b_a)} not present in dataset")
-                logger.warning("Writing ids to debug.txt")
-                with open("debug.txt", "w") as f:
-                    if len(diff_a_b) > 0:
-                        f.write("In datset but not in new data:\n")
-                        for i in diff_a_b:
-                            f.write(f"{i}\n")
-                    if len(diff_b_a) > 0:
+                logger.warning(f"Dataset ({len(dataset)} rows) has {len(diff_a_b)} rows not present in new data")
+                logger.warning(f"New data ({len(new_data)} rows) has {len(diff_b_a)} rows not present in dataset")
+                logger.warning("Writing ids to log2kaldi/missing_ids.txt")
+                os.makedirs("kaldi_data_processing", exist_ok=True)
+                if len(diff_a_b) > 0:
+                    with open(os.path.join("kaldi_data_processing",f"merge_new_data_missing_{self.execute_order}_{self.__class__.__name__}.txt"), "w") as f:
+                            f.write("In dataset but not in new data:\n")
+                            for i in diff_a_b:
+                                f.write(f"{i}\n")
+                if len(diff_b_a) > 0:
+                    with open(os.path.join("kaldi_data_processing",f"merge_dataset_missing_{self.execute_order}_{self.__class__.__name__}.txt"), "w") as f:
                         f.write("In new data but not in dataset:\n")
                         for i in diff_b_a:
                             f.write(f"{i}\n")
 
@@ -10,6 +10,7 @@
 
 logger = logging.getLogger(__name__)
 
+LOG_FOLDER = "kaldi_data_processing"
 
 @dataclass
 class KaldiDatasetRow:
@@ -247,6 +248,30 @@ def get_duration(self, mode=sum, target="segment"):
             return mode(durations)
         return mode([i.duration for i in self.dataset])
 
+    def check_if_segments_in_audios(self, acceptance_end_s=0.25):
+        from pydub.utils import mediainfo
+
+        new_data = []
+        removed_lines = []
+        files_duration = dict()
+        for row in tqdm(self, desc="Check if segments are in audios"):
+            if row.audio_path not in files_duration:
+                dur = round(float(mediainfo(row.audio_path)["duration"]), 3)
+                files_duration[row.audio_path] = dur
+            dur = files_duration[row.audio_path]
+            if row.start >= dur:
+                removed_lines.append(row)
+            elif row.end > dur + acceptance_end_s:
+                removed_lines.append(row)
+            else:
+                new_data.append(row)
+        self.dataset = new_data
+        logger.info(f"Removed {len(removed_lines)} segments that were not in audios (start or end after audio), check removed_lines_not_in_audios file")
+        os.makedirs(LOG_FOLDER, exist_ok=True)
+        with open(os.path.join(LOG_FOLDER, "removed_lines_not_in_audios"), "w") as f:
+            for row in removed_lines:
+                f.write(str(row) + "\n")
+
     def filter_by_audio_ids(self, audio_ids):
         """
         Filter the dataset by audio ids
@@ -281,7 +306,7 @@ def filter_by_speakers(self, speakers):
                 new_dataset.append(row)
         return new_dataset
 
-    def normalize_dataset(self, apply_text_normalization=True):
+    def normalize_dataset(self, apply_text_normalization=True, wer_format=False):
         """
         Normalize the texts in the dataset using the format_text_latin function from ssak.utils.text_latin
 
@@ -296,7 +321,7 @@ def normalize_dataset(self, apply_text_normalization=True):
         for row in tqdm(self.dataset, total=len(self.dataset), desc="Normalizing texts"):
             from ssak.utils.text_latin import format_text_latin
 
-            row.normalized_text = format_text_latin(row.text)
+            row.normalized_text = format_text_latin(row.text, wer_format=wer_format)
             if apply_text_normalization:
                 row.text = row.normalized_text
 
@@ -357,7 +382,8 @@ def normalize_audios(self, output_wavs_conversion_folder, target_sample_rate=160
                 else:
                     removed_lines.append(row)
             self.dataset = new_dataset
-            with open("removed_lines", "w") as f:
+            os.makedirs(LOG_FOLDER, exist_ok=True)
+            with open(os.path.join(LOG_FOLDER, "removed_lines_audio_empty"), "w") as f:
                 for row in removed_lines:
                     f.write(str(row) + "\n")
 
@@ -571,7 +597,8 @@ def apply_filter(self, filter, filter_out=True):
             else:
                 removed_lines.append(row)
         self.dataset = new_data
-        with open("filtered_out", "w") as f:
+        os.makedirs(LOG_FOLDER, exist_ok=True)
+        with open(os.path.join(LOG_FOLDER, f"filtered_out_with_{filter.__name__ }"), "w") as f:
             for row in removed_lines:
                 f.write(str(row) + "\n")
 
 
@@ -398,6 +398,7 @@ class Monitoring:
 
     def __init__(self, output_folder="", name="", interval=0.25, device="cuda", plot_monitoring=True, show_steps_in_plots=True):
         self.device = device
+        self.device_name = None
         self.output_folder = output_folder
         if not name:
             self.name = output_folder
@@ -408,13 +409,25 @@ def __init__(self, output_folder="", name="", interval=0.25, device="cuda", plot
         self.will_plot_monitoring = plot_monitoring
         if self.will_plot_monitoring:
             pass
+        self.device = self.device if self.device else 0
+        if self.device=="cuda" or self.device == "gpu":
+            self.device = 0
+        elif self.device.startswith("cuda:"):
+            self.device = int(self.device.split(":")[1])
+        if self.device != "cpu" and isinstance(self.device, int):
+            num_gpus = get_num_gpus()
+            if self.device>num_gpus:
+                raise ValueError(f"GPU {self.device} doesn't exist, only {num_gpus} GPUs available")
+            self.device = ALL_GPU_INDICES[self.device]
+        elif self.device != "cpu":
+            raise ValueError(f"Device {self.device} doesn't exist, use 'gpu', 'cpu', 'cuda', 'cuda:0' or '0' for example")
 
     def _finish_step(self, monitoring, step_values, step=0, start=0):
         for i in step_values:
             if i not in monitoring:
                 monitoring[i] = []
             monitoring[i].extend(step_values[i])
-        if self.steps and len(self.steps)>0:
+        if self.steps and len(self.steps) > 0 and step < len(self.steps):
             if "steps" not in monitoring:
                 monitoring["steps"] = []
             if "steps_end" not in monitoring:
@@ -464,9 +477,11 @@ def _monitor(self):
             start = time.time() - monitoring["time_points"][-1]
             if "device" in monitoring and monitoring["device"] != (pynvml.nvmlDeviceGetName(handle) if handle else "cpu"):
                 raise ValueError("The device used in the monitoring is different from the one specified in the current monitoring")
+            self.device_name = monitoring.get("device", "cpu")
         else:
             monitoring = dict()
             monitoring["device"] = pynvml.nvmlDeviceGetName(handle) if handle else "cpu"
+            self.device_name = monitoring["device"]
             start = time.time()
         step = 0
         step_monitoring = dict()
@@ -498,12 +513,6 @@ def start(self, steps=None):
             steps: list of str
                 List of steps to monitor
         """
-        self.device = self.device if self.device else 0
-        if self.device == "cuda" or self.device == "gpu":
-            self.device = 0
-        if self.device != "cpu":
-            get_num_gpus()
-            self.device = ALL_GPU_INDICES[self.device]
         self.event_stop = threading.Event()
         self.event_next = threading.Event()
         self.event_error = threading.Event()
@@ -530,6 +539,18 @@ def stop(self, error=False):
             self.event_stop.set()
         self.monitoring_thread.join()
 
+    def get_device_name(self):
+        if self.device_name is None:
+            if self.device != "cpu":
+                pynvml.nvmlInit()
+                handle = pynvml.nvmlDeviceGetHandleByIndex(self.device)
+            else:
+                handle = None
+            self.device_name = pynvml.nvmlDeviceGetName(handle) if handle else "cpu"
+            if handle:
+                pynvml.nvmlShutdown()
+        return self.device_name
+
     def plot_hardware(self, values, times, output_folder, ylabel="RAM Usage", lims=None, steps=None):
         import matplotlib.pyplot as plt
 
 
@@ -3,18 +3,16 @@
 
 from ssak.utils.text_basic import (
     _punctuation,
+    collapse_whitespace,
+    format_special_characters,
+    remove_punctuations,
+    transliterate,
 )
 from ssak.utils.text_utils import (
     numbers_and_symbols_to_letters,
     regex_escape,
     remove_special_characters,
 )
-from ssak.utils.text_basic import (
-    collapse_whitespace,
-    format_special_characters,
-    remove_punctuations,
-    transliterate,
-)
 
 
 def _rm_key(d, key):
@@ -42,17 +40,7 @@ def find_acronyms(text, ignore_first_upper_words=True):
 
 
 def format_text_latin(
-    text,
-    lang="fr",
-    lower_case=True,
-    keep_punc=False,
-    remove_ligatures=True,
-    convert_numbers=True,
-    extract_parenthesis=False,
-    fid_acronyms=None,
-    fid_special_chars=None,
-    safety_checks=True,
-    remove_suspicious_entry=False,
+    text, lang="fr", lower_case=True, keep_punc=False, remove_ligatures=True, convert_numbers=True, extract_parenthesis=False, fid_acronyms=None, fid_special_chars=None, safety_checks=True, remove_suspicious_entry=False, wer_format=True
 ):
     opts = _rm_key(locals(), "text")
 
@@ -139,7 +127,10 @@ def format_text_latin(
             text = re.sub(":", " : ", text)
             text = re.sub(";", " ; ", text)
         # text = re.sub("^ *-+", "", text)
-        text = re.sub("'", "' ", text)
+        if wer_format:
+            text = re.sub("'", "' ", text)
+        else:
+            text = re.sub("' ", "'", text)
         text = re.sub(r"\^+", "", text)
         text = re.sub(" +(- +)+", " ", text)
         text = re.sub("- ", " ", text)
@@ -171,7 +162,6 @@ def format_text_latin(
                 # text_rep=split_h[0]+' heures '+split_h[1]
                 text = text.replace(h, text_rep)
 
-        if convert_numbers:
             text = numbers_and_symbols_to_letters(text, lang=lang)
 
             if lang == "fr":
 
@@ -155,6 +155,7 @@ def further_normalize(s):
 
         if strong_normalization:
             from ssak.utils.text_basic import collapse_whitespace
+
             def remove_not_words(s):
                 # Remove any character that is not alpha-numeric (e.g. apostrophes, dashes, ...)
                 return collapse_whitespace(re.sub(r"[^\w]", " ", s))
@@ -781,7 +782,9 @@ def func_ylabel(title, *args, **kwargs):
         plt.legend(
             fontsize=label_size,
             ncols=2,
-            loc="best",
+            # loc="best",
+            loc='upper left', 
+            bbox_to_anchor=(1, 1)
         )
     if show_axisnames:
         use_percent = scale == 100
Original file line number	Diff line number	Diff line change
`@@ -13,8 +13,7 @@`
`13`	`13`	`load_model,`
`14`	`14`	`)`
`15`	`15`	`from ssak.utils.misc import hashmd5`
`16`		`-from ssak.utils.text_basic import transliterate`
`17`		`-from ssak.utils.text_basic import _punctuation`
	`16`	`+from ssak.utils.text_basic import _punctuation, transliterate`
`18`	`17`	`from ssak.utils.viewer import PlayWav`
`19`	`18`
`20`	`19`	`imshow_opts = dict(origin="upper", aspect="auto", vmax=0) # vmin = -25,`