Sanitizes script text to improve speaker detection and prevent voice loading issues.

laurentftech · laurentftech · commit 7afe08bee54e · 2025-11-12T21:16:38.000+01:00
diff --git a/generate_podcast.py b/generate_podcast.py
@@ -18,7 +18,7 @@
 
 import re
 import requests
-from utils import get_app_data_dir, find_ffmpeg_path, sanitize_app_settings_for_backend
+from utils import get_app_data_dir, find_ffmpeg_path, sanitize_app_settings_for_backend, sanitize_text
 
 # Global logger instance - initialized once when module is imported
 logger = logging.getLogger(__name__)
@@ -566,6 +566,9 @@ def generate(script_text: str, app_settings: dict, output_filepath: str, status_
     logger.info("Starting generation function.")
     status_callback("Starting podcast generation...")
 
+    # Sanitize the script text at the entry point of the generation logic
+    sanitized_script_text = sanitize_text(script_text)
+
     ffmpeg_path = find_ffmpeg_path()
     if not ffmpeg_path:
         status_callback("--- CRITICAL ERROR ---")
@@ -592,12 +595,12 @@ def generate(script_text: str, app_settings: dict, output_filepath: str, status_
     if provider_name == "gemini":
         speaker_mapping = (app_settings or {}).get("speaker_voices", {})
         provider = GeminiTTS(api_key=api_key)
-        return provider.synthesize(script_text=script_text, speaker_mapping=speaker_mapping,
+        return provider.synthesize(script_text=sanitized_script_text, speaker_mapping=speaker_mapping,
                                    output_filepath=output_filepath, status_callback=status_callback)
     else:
         speaker_mapping = (app_settings or {}).get("speaker_voices_elevenlabs", {})
         provider = ElevenLabsTTS(api_key=api_key)
-        return provider.synthesize(script_text=script_text, speaker_mapping=speaker_mapping,
+        return provider.synthesize(script_text=sanitized_script_text, speaker_mapping=speaker_mapping,
                                    output_filepath=output_filepath, status_callback=status_callback)
 
 
@@ -735,7 +738,7 @@ def sanitize_app_settings_for_backend(app_settings: Dict[str, Any]) -> Dict[str,
 
     temp_script_file_path = None
     if args.script_text:
-        script_text = args.script_text
+        script_text = sanitize_text(args.script_text)
         script_source_description = "the provided text"
         if not args.output_filepath:
             parser.error("argument --output is required when using --script-text.")
@@ -751,7 +754,7 @@ def sanitize_app_settings_for_backend(app_settings: Dict[str, Any]) -> Dict[str,
     else:  # script_filepath is guaranteed to be not None here
         try:
             with open(args.script_filepath, 'r', encoding='utf-8') as f:
-                script_text = f.read()
+                script_text = sanitize_text(f.read())
             script_filepath_for_demo = args.script_filepath
             script_source_description = f"'{os.path.basename(args.script_filepath)}'"
         except FileNotFoundError:
diff --git a/gui.py b/gui.py
@@ -38,7 +38,7 @@
 from about_window import AboutWindow
 from api_keys_window import APIKeysWindow
 from generate_podcast import validate_speakers, update_elevenlabs_quota
-from utils import get_asset_path, sanitize_app_settings_for_backend, find_ffplay_path, get_app_data_dir
+from utils import get_asset_path, sanitize_app_settings_for_backend, find_ffplay_path, get_app_data_dir, sanitize_text
 from create_demo import create_html_demo_whisperx
 
 # --- Versioning ---
@@ -1010,9 +1010,11 @@ def load_script_from_file(self):
 
         try:
             with open(filepath, 'r', encoding='utf-8') as f:
+                content = f.read()
+                sanitized_content = sanitize_text(content)
                 self.script_text.delete('1.0', tk.END)
-                self.script_text.insert('1.0', f.read())
-            self.log_status(f"Script loaded from: {os.path.basename(filepath)}")
+                self.script_text.insert('1.0', sanitized_content)
+            self.log_status(f"Script loaded and sanitized from: {os.path.basename(filepath)}")
         except Exception as e:
             messagebox.showerror("Reading error", f"Cannot read the file:\n{e}", parent=self.root)
             self.logger.error(f"Error reading the script: {e}")
@@ -1025,11 +1027,13 @@ def start_generation_thread(self):
                                    parent=self.root)
             return
 
-        self.last_generated_script = script_content  # Store script for demo
+        # Sanitize the script content before using it
+        sanitized_script = sanitize_text(script_content)
+        self.last_generated_script = sanitized_script  # Store sanitized script for demo
 
         # --- Validate Speaker Voices ---
         try:
-            missing_speakers, configured_speakers = validate_speakers(script_content, self.app_settings)
+            missing_speakers, configured_speakers = validate_speakers(sanitized_script, self.app_settings)
         except ValueError as e:
             # Règle Gemini: plus de 2 speakers -> erreur bloquante
             messagebox.showerror("Configuration Error", str(e), parent=self.root)
@@ -1091,7 +1095,7 @@ def start_generation_thread(self):
 
         thread = threading.Thread(
             target=self.run_generation,
-            args=(script_content, output_filepath, self.app_settings, self.api_key)
+            args=(sanitized_script, output_filepath, self.app_settings, self.api_key)
         )
         thread.daemon = True
         thread.start()
diff --git a/utils.py b/utils.py
@@ -2,7 +2,9 @@
 import sys
 import shutil
 from typing import Optional, Dict, Any
-
+import re
+import unicodedata
+from html import unescape
 
 def get_asset_path(filename: str) -> Optional[str]:
     """
@@ -57,6 +59,37 @@ def find_ffplay_path() -> Optional[str]:
     return _find_command_path("ffplay")
 
 
+def sanitize_text(text: str) -> str:
+    """Clean script text while keeping its line structure.
+
+    Strips HTML/XML tags, decodes entities, applies NFKC, and maps exotic
+    spaces and Word punctuation to ASCII. Line breaks are unified to LF and
+    kept, so each "Speaker: text" line stays separate. Falsy input gives "".
+    """
+    if not text:
+        return ""
+
+    # Strip residual HTML/XML tags (e.g. <p>, or <o:p> from Word).
+    text = re.sub(r"<[^>]+>", " ", text)
+    # Decode HTML entities (&nbsp;, &amp;), then fold compatibility forms.
+    text = unicodedata.normalize("NFKC", unescape(text))
+    # Unify all line-break flavours first so later steps cannot eat them.
+    text = re.sub(r"\r\n?|[\x0b\x0c\x85\u2028\u2029]", "\n", text)
+    # Tabs and non-breaking/typographic spaces become plain spaces.
+    text = re.sub(r"[\t\u00A0\u2000-\u200B\u202F\u205F\u3000]", " ", text)
+    # Drop the remaining control characters, but never the line feed.
+    text = re.sub(r"[\x00-\x09\x0b-\x1f\x7f-\x9f]", "", text)
+    # Word smart quotes, dashes and bullets to ASCII equivalents.
+    text = text.translate(str.maketrans({
+        "“": '"', "”": '"', "‘": "'", "’": "'",
+        "–": "-", "—": "-", "•": "-",
+    }))
+    # Collapse horizontal whitespace runs and trim the edges of every line.
+    text = re.sub(r" ?\n ?", "\n", re.sub(r"[^\S\n]+", " ", text))
+    # Keep at most one blank line between paragraphs; trim both ends.
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
 def sanitize_app_settings_for_backend(app_settings: Dict[str, Any]) -> Dict[str, Any]:
     """
     Creates a "clean" version of app_settings suitable for the backend.