Merge branch 'main' into feature_added-ollama

bram · bram · commit 2dd98aaf1831 · 2025-10-08T16:06:15.000+02:00
diff --git a/README.md b/README.md
@@ -73,20 +73,22 @@ export AZURE_OPENAI_API_VERSION='2024-02-01'
 ### Or Use Ollama (Local, No API Key Needed)
 
 ```bash
-# Install Ollama
+# 1. Install Ollama
 curl -fsSL https://ollama.com/install.sh | sh
 
-# Pull a model (qwen2.5 recommended for multilingual)
-ollama pull qwen2.5
+# 2. Pull a model
+ollama pull qwen2.5    # Best for multilingual (Arabic, Chinese, etc.)
+# OR
+ollama pull llama3.2   # Fast for European languages
 
-# Use it (no API key required!)
+# 3. Translate (no API key required!)
 gpt-po-translator --provider ollama --folder ./locales
 
-# For non-Latin scripts (Arabic, Chinese, etc.), omit --bulk for better quality
+# For non-Latin scripts, use qwen2.5 WITHOUT --bulk
 gpt-po-translator --provider ollama --model qwen2.5 --folder ./locales --lang ar
 ```
 
-> **💡 Tip:** For Ollama with non-Latin languages, **don't use `--bulk`** mode. Single-item translation produces better results since the model doesn't need to handle JSON formatting.
+> **💡 Important:** For Ollama with **non-Latin languages** (Arabic, Chinese, Japanese, etc.), **omit the `--bulk` flag**. Single-item translation is more reliable because the model doesn't have to format responses as JSON.
 
 ## 💡 Usage Examples
 
diff --git a/docs/usage.md b/docs/usage.md
@@ -426,8 +426,6 @@ gpt-po-translator --folder ./locales --lang de --no-ai-comment
 
 ---
 
-<<<<<<< Updated upstream
-=======
 ## Using Ollama (Local AI Provider)
 
 ### Overview
@@ -806,7 +804,6 @@ Both modes use the same preservation logic, ensuring consistent behavior.
 
 ---
 
->>>>>>> Stashed changes
 ## Behind the Scenes: API Calls and Post-Processing
 
 - **Provider-Specific API Calls:**  
diff --git a/python_gpt_po/services/translation_service.py b/python_gpt_po/services/translation_service.py
@@ -206,6 +206,9 @@ def perform_translation_without_validation(
             target_language: str,
             detail_language: Optional[str] = None) -> str:
         """Performs translation without validation for single words or short phrases."""
+        # Strip text before sending to AI (whitespace will be restored in validate_translation)
+        text_stripped = text.strip()
+
         # Use the detailed language name if provided, otherwise use the short code
         target_lang_text = detail_language if detail_language else target_language
 
@@ -216,7 +219,7 @@ def perform_translation_without_validation(
         )
 
         return self.validate_translation(text, self.perform_translation(
-            prompt + text, target_language, is_bulk=False, detail_language=detail_language
+            prompt + text_stripped, target_language, is_bulk=False, detail_language=detail_language
         ), target_language)
 
     @staticmethod
@@ -231,6 +234,7 @@ def get_translation_prompt(target_language: str, is_bulk: bool, detail_language:
                 "Provide only the translations in a JSON array format, maintaining the original order. "
                 "Each translation should be concise and direct, without explanations or additional context. "
                 "Keep special characters, placeholders, and formatting intact. "
+                "Do NOT add or remove any leading/trailing whitespace - translate only the text content. "
                 "If a term should not be translated (like 'URL' or technical terms), keep it as is. "
                 "Example format: [\"Translation 1\", \"Translation 2\", ...]\n\n"
                 "Texts to translate:\n"
@@ -253,15 +257,21 @@ def perform_translation(
         """Performs the actual translation using the selected provider's API."""
         logging.debug("Translating to '%s' via %s API", target_language, self.config.provider.value)
         prompt = self.get_translation_prompt(target_language, is_bulk, detail_language)
-        content = prompt + (json.dumps(texts) if is_bulk else texts)
+
+        # For bulk mode, strip whitespace before sending to AI
+        if is_bulk:
+            stripped_texts = [text.strip() for text in texts]
+            content = prompt + json.dumps(stripped_texts)
+        else:
+            content = prompt + texts
 
         try:
             # Get the response text from the provider
             response_text = self._get_provider_response(content)
 
             # Process the response according to bulk mode
             if is_bulk:
-                return self._process_bulk_response(response_text, texts, target_language)
+                return self._process_bulk_response(response_text, texts, target_language, stripped_texts)
             return self.validate_translation(texts, response_text, target_language)
 
         except Exception as e:
@@ -280,67 +290,112 @@ def _get_provider_response(self, content: str) -> str:
             return ""
         return provider_instance.translate(self.config.provider_clients, self.config.model, content)
 
-    def _process_bulk_response(self, response_text: str, original_texts: List[str], target_language: str) -> List[str]:
-        """Process a bulk translation response."""
+    @staticmethod
+    def _fix_json_quotes(json_text: str) -> str:
+        """Fix non-standard quotes in JSON response.
+
+        Args:
+            json_text: JSON text with potentially non-standard quotes
+
+        Returns:
+            JSON text with normalized quotes
+        """
+        quote_fixes = [
+            ('"', '"'),   # Left double quotation mark
+            ('"', '"'),   # Right double quotation mark
+            ('„', '"'),   # Double low-9 quotation mark (Lithuanian, German)
+            ('"', '"'),   # Left double quotation mark (alternative)
+            (''', "'"),   # Left single quotation mark
+            (''', "'"),   # Right single quotation mark
+            ('‚', "'"),   # Single low-9 quotation mark
+            ('«', '"'),   # Left-pointing double angle quotation mark
+            ('»', '"'),   # Right-pointing double angle quotation mark
+            ('‹', "'"),   # Left-pointing single angle quotation mark
+            ('›', "'"),   # Right-pointing single angle quotation mark
+        ]
+
+        fixed_text = json_text
+        for old_quote, new_quote in quote_fixes:
+            fixed_text = fixed_text.replace(old_quote, new_quote)
+
+        # Apply regex fix to handle quotes inside strings
+        fixed_text = re.sub(
+            r'"([^"\\]*(\\.[^"\\]*)*)"',
+            lambda m: f'"{m.group(1).replace(chr(92) + chr(34), chr(34))}"',
+            fixed_text
+        )
+        return fixed_text
+
+    def _extract_translations_from_malformed_json(
+            self,
+            json_text: str,
+            expected_count: int) -> List[str]:
+        """Extract translations from malformed JSON as a fallback.
+
+        Args:
+            json_text: Malformed JSON text
+            expected_count: Expected number of translations
+
+        Returns:
+            List of extracted translations
+
+        Raises:
+            ValueError: If extraction fails or count mismatch
+        """
+        if '[' not in json_text or ']' not in json_text:
+            raise ValueError("No array structure found in malformed JSON")
+
+        # Extract content between first [ and last ]
+        start_idx = json_text.find('[')
+        end_idx = json_text.rfind(']') + 1
+        array_content = json_text[start_idx:end_idx]
+
+        # Try to extract quoted strings
+        matches = re.findall(r'"([^"]*(?:\\.[^"]*)*)"', array_content)
+        if not matches or len(matches) != expected_count:
+            raise ValueError(
+                f"Could not extract expected number of translations "
+                f"(expected {expected_count}, got {len(matches) if matches else 0})"
+            )
+
+        # Unescape the extracted strings
+        return [match.replace('\\"', '"').replace("\\'", "'") for match in matches]
+
+    def _process_bulk_response(
+            self,
+            response_text: str,
+            original_texts: List[str],
+            target_language: str,
+            _stripped_texts: Optional[List[str]] = None) -> List[str]:
+        """Process a bulk translation response.
+
+        Args:
+            response_text: The raw response from the AI provider
+            original_texts: The original texts WITH whitespace
+            target_language: Target language code
+            _stripped_texts: The stripped texts sent to AI (unused, for future use)
+        """
+        # Note: _stripped_texts parameter kept for future validation features
+        # Current validation happens per-entry using original_texts
         try:
-            # Clean the response text for formatting issues
             clean_response = self._clean_json_response(response_text)
             logging.debug("Cleaned JSON response: %s...", clean_response[:100])
 
             # First attempt: try parsing as-is
             try:
                 translated_texts = json.loads(clean_response)
             except json.JSONDecodeError:
-                # Second attempt: fix various quote types that break JSON
-                # First, normalize all quote types to standard quotes
-                # Handle different languages' quotation marks
-                quote_fixes = [
-                    ('"', '"'),   # Left double quotation mark
-                    ('"', '"'),   # Right double quotation mark
-                    ('„', '"'),   # Double low-9 quotation mark (Lithuanian, German)
-                    ('"', '"'),   # Left double quotation mark (alternative)
-                    (''', "'"),   # Left single quotation mark
-                    (''', "'"),   # Right single quotation mark
-                    ('‚', "'"),   # Single low-9 quotation mark
-                    ('«', '"'),   # Left-pointing double angle quotation mark
-                    ('»', '"'),   # Right-pointing double angle quotation mark
-                    ('‹', "'"),   # Left-pointing single angle quotation mark
-                    ('›', "'"),   # Right-pointing single angle quotation mark
-                ]
-
-                fixed_response = clean_response
-                for old_quote, new_quote in quote_fixes:
-                    fixed_response = fixed_response.replace(old_quote, new_quote)
-
-                # Apply fix to all JSON strings (but not the JSON structure quotes)
+                # Second attempt: fix non-standard quotes
+                fixed_response = self._fix_json_quotes(clean_response)
                 try:
-                    # More sophisticated regex to handle quotes inside strings
-                    fixed_response = re.sub(
-                        r'"([^"\\]*(\\.[^"\\]*)*)"',
-                        lambda m: f'"{m.group(1).replace(chr(92) + chr(34), chr(34))}"',
-                        fixed_response)
                     translated_texts = json.loads(fixed_response)
-                except json.JSONDecodeError as e:
-                    # Final attempt: try to extract array elements manually
-                    # This is a fallback for severely malformed JSON
-                    logging.warning("API returned malformed JSON, attempting to extract translations manually")
-
-                    # Try to find array-like structure and extract elements
-                    if '[' in fixed_response and ']' in fixed_response:
-                        # Extract content between first [ and last ]
-                        start_idx = fixed_response.find('[')
-                        end_idx = fixed_response.rfind(']') + 1
-                        array_content = fixed_response[start_idx:end_idx]
-
-                        # Try to extract quoted strings
-                        matches = re.findall(r'"([^"]*(?:\\.[^"]*)*)"', array_content)
-                        if matches and len(matches) == len(original_texts):
-                            # Unescape the extracted strings
-                            translated_texts = [match.replace('\\"', '"').replace("\\'", "'") for match in matches]
-                        else:
-                            raise ValueError("Could not extract expected number of translations") from e
-                    else:
-                        raise
+                except json.JSONDecodeError:
+                    # Final attempt: extract from malformed JSON
+                    logging.warning("API returned malformed JSON, extracting translations manually")
+                    translated_texts = self._extract_translations_from_malformed_json(
+                        fixed_response,
+                        len(original_texts)
+                    )
 
             # Validate the format
             if not isinstance(translated_texts, list) or len(translated_texts) != len(original_texts):
@@ -386,9 +441,19 @@ def _clean_json_response(self, response_text: str) -> str:
 
     def validate_translation(self, original: str, translated: str, target_language: str) -> str:
         """Validates the translation and retries if necessary."""
+        # Extract leading/trailing whitespace from original
+        original_stripped = original.strip()
+        if not original_stripped:
+            # If original is all whitespace, preserve it as-is
+            return original
+
+        leading_ws = original[:len(original) - len(original.lstrip())]
+        trailing_ws = original[len(original.rstrip()):]
+
+        # Strip the translation for validation
         translated = translated.strip()
 
-        if len(translated.split()) > 2 * len(original.split()) + 1:
+        if len(translated.split()) > 2 * len(original_stripped.split()) + 1:
             logging.debug("Translation too verbose (%d words), retrying", len(translated.split()))
             return self.retry_long_translation(original, target_language)
 
@@ -397,10 +462,16 @@ def validate_translation(self, original: str, translated: str, target_language:
             logging.debug("Translation contains explanation, retrying")
             return self.retry_long_translation(original, target_language)
 
-        return translated
+        # Restore original whitespace
+        return leading_ws + translated + trailing_ws
 
     def retry_long_translation(self, text: str, target_language: str) -> str:
         """Retries translation for long or explanatory responses."""
+        # Extract leading/trailing whitespace from original
+        leading_ws = text[:len(text) - len(text.lstrip())]
+        trailing_ws = text[len(text.rstrip()):]
+        text_stripped = text.strip()
+
         prompt = (
             f"Translate this text concisely from English to {target_language}. "
             "Provide only the direct translation without any explanation or additional context. "
@@ -410,15 +481,16 @@ def retry_long_translation(self, text: str, target_language: str) -> str:
         )
 
         try:
-            content = prompt + text
-            retried_translation = self._get_provider_response(content)
+            content = prompt + text_stripped
+            retried_translation = self._get_provider_response(content).strip()
 
-            if len(retried_translation.split()) > 2 * len(text.split()) + 1:
+            if len(retried_translation.split()) > 2 * len(text_stripped.split()) + 1:
                 logging.debug("Retry still too verbose, skipping")
                 return ""  # Return empty string instead of English text
 
             logging.debug("Retry successful")
-            return retried_translation
+            # Restore original whitespace
+            return leading_ws + retried_translation + trailing_ws
 
         except Exception as e:
             logging.debug("Retry failed: %s", str(e)[:100])
@@ -732,6 +804,22 @@ def _prepare_translation_request(self, po_file, po_file_path, file_lang, detail_
         texts = [entry.msgid for entry in entries]
         detail_lang = detail_languages.get(file_lang) if detail_languages else None
 
+        # Check for and warn about whitespace in msgid
+        whitespace_entries = [
+            text for text in texts
+            if text and (text != text.strip())
+        ]
+        if whitespace_entries:
+            logging.warning(
+                "Found %d entries with leading/trailing whitespace in %s. "
+                "Whitespace will be preserved in translations, but ideally should be handled in your UI framework.",
+                len(whitespace_entries),
+                po_file_path
+            )
+            if logging.getLogger().isEnabledFor(logging.DEBUG):
+                for text in whitespace_entries[:3]:  # Show first 3 examples
+                    logging.debug("  Example: %s", repr(text))
+
         return TranslationRequest(
             po_file=po_file,
             entries=entries,
diff --git a/python_gpt_po/tests/test_whitespace_preservation.py b/python_gpt_po/tests/test_whitespace_preservation.py