Skip to content

Commit 2dd98aa

Browse files
author
bram
committed
Merge branch 'main' into feature_added-ollama
2 parents 9c6b013 + 3985e05 commit 2dd98aa

File tree

4 files changed

+335
-69
lines changed

4 files changed

+335
-69
lines changed

README.md

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,20 +73,22 @@ export AZURE_OPENAI_API_VERSION='2024-02-01'
7373
### Or Use Ollama (Local, No API Key Needed)
7474

7575
```bash
76-
# Install Ollama
76+
# 1. Install Ollama
7777
curl -fsSL https://ollama.com/install.sh | sh
7878

79-
# Pull a model (qwen2.5 recommended for multilingual)
80-
ollama pull qwen2.5
79+
# 2. Pull a model
80+
ollama pull qwen2.5 # Best for multilingual (Arabic, Chinese, etc.)
81+
# OR
82+
ollama pull llama3.2 # Fast for European languages
8183

82-
# Use it (no API key required!)
84+
# 3. Translate (no API key required!)
8385
gpt-po-translator --provider ollama --folder ./locales
8486

85-
# For non-Latin scripts (Arabic, Chinese, etc.), omit --bulk for better quality
87+
# For non-Latin scripts, use qwen2.5 WITHOUT --bulk
8688
gpt-po-translator --provider ollama --model qwen2.5 --folder ./locales --lang ar
8789
```
8890

89-
> **💡 Tip:** For Ollama with non-Latin languages, **don't use `--bulk`** mode. Single-item translation produces better results since the model doesn't need to handle JSON formatting.
91+
> **💡 Important:** For Ollama with **non-Latin languages** (Arabic, Chinese, Japanese, etc.), **omit the `--bulk` flag**. Single-item translation is more reliable because the model doesn't have to format responses as JSON.
9092
9193
## 💡 Usage Examples
9294

docs/usage.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -426,8 +426,6 @@ gpt-po-translator --folder ./locales --lang de --no-ai-comment
426426

427427
---
428428

429-
<<<<<<< Updated upstream
430-
=======
431429
## Using Ollama (Local AI Provider)
432430

433431
### Overview
@@ -806,7 +804,6 @@ Both modes use the same preservation logic, ensuring consistent behavior.
806804

807805
---
808806

809-
>>>>>>> Stashed changes
810807
## Behind the Scenes: API Calls and Post-Processing
811808

812809
- **Provider-Specific API Calls:**

python_gpt_po/services/translation_service.py

Lines changed: 148 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ def perform_translation_without_validation(
206206
target_language: str,
207207
detail_language: Optional[str] = None) -> str:
208208
"""Performs translation without validation for single words or short phrases."""
209+
# Strip text before sending to AI (whitespace will be restored in validate_translation)
210+
text_stripped = text.strip()
211+
209212
# Use the detailed language name if provided, otherwise use the short code
210213
target_lang_text = detail_language if detail_language else target_language
211214

@@ -216,7 +219,7 @@ def perform_translation_without_validation(
216219
)
217220

218221
return self.validate_translation(text, self.perform_translation(
219-
prompt + text, target_language, is_bulk=False, detail_language=detail_language
222+
prompt + text_stripped, target_language, is_bulk=False, detail_language=detail_language
220223
), target_language)
221224

222225
@staticmethod
@@ -231,6 +234,7 @@ def get_translation_prompt(target_language: str, is_bulk: bool, detail_language:
231234
"Provide only the translations in a JSON array format, maintaining the original order. "
232235
"Each translation should be concise and direct, without explanations or additional context. "
233236
"Keep special characters, placeholders, and formatting intact. "
237+
"Do NOT add or remove any leading/trailing whitespace - translate only the text content. "
234238
"If a term should not be translated (like 'URL' or technical terms), keep it as is. "
235239
"Example format: [\"Translation 1\", \"Translation 2\", ...]\n\n"
236240
"Texts to translate:\n"
@@ -253,15 +257,21 @@ def perform_translation(
253257
"""Performs the actual translation using the selected provider's API."""
254258
logging.debug("Translating to '%s' via %s API", target_language, self.config.provider.value)
255259
prompt = self.get_translation_prompt(target_language, is_bulk, detail_language)
256-
content = prompt + (json.dumps(texts) if is_bulk else texts)
260+
261+
# For bulk mode, strip whitespace before sending to AI
262+
if is_bulk:
263+
stripped_texts = [text.strip() for text in texts]
264+
content = prompt + json.dumps(stripped_texts)
265+
else:
266+
content = prompt + texts
257267

258268
try:
259269
# Get the response text from the provider
260270
response_text = self._get_provider_response(content)
261271

262272
# Process the response according to bulk mode
263273
if is_bulk:
264-
return self._process_bulk_response(response_text, texts, target_language)
274+
return self._process_bulk_response(response_text, texts, target_language, stripped_texts)
265275
return self.validate_translation(texts, response_text, target_language)
266276

267277
except Exception as e:
@@ -280,67 +290,112 @@ def _get_provider_response(self, content: str) -> str:
280290
return ""
281291
return provider_instance.translate(self.config.provider_clients, self.config.model, content)
282292

283-
def _process_bulk_response(self, response_text: str, original_texts: List[str], target_language: str) -> List[str]:
284-
"""Process a bulk translation response."""
293+
@staticmethod
294+
def _fix_json_quotes(json_text: str) -> str:
295+
"""Fix non-standard quotes in JSON response.
296+
297+
Args:
298+
json_text: JSON text with potentially non-standard quotes
299+
300+
Returns:
301+
JSON text with normalized quotes
302+
"""
303+
quote_fixes = [
304+
('“', '"'), # Left double quotation mark
305+
('”', '"'), # Right double quotation mark
306+
('„', '"'), # Double low-9 quotation mark (Lithuanian, German)
307+
('"', '"'), # Left double quotation mark (alternative)
308+
('‘', "'"), # Left single quotation mark
309+
('’', "'"), # Right single quotation mark
310+
('‚', "'"), # Single low-9 quotation mark
311+
('«', '"'), # Left-pointing double angle quotation mark
312+
('»', '"'), # Right-pointing double angle quotation mark
313+
('‹', "'"), # Left-pointing single angle quotation mark
314+
('›', "'"), # Right-pointing single angle quotation mark
315+
]
316+
317+
fixed_text = json_text
318+
for old_quote, new_quote in quote_fixes:
319+
fixed_text = fixed_text.replace(old_quote, new_quote)
320+
321+
# Apply regex fix to handle quotes inside strings
322+
fixed_text = re.sub(
323+
r'"([^"\\]*(\\.[^"\\]*)*)"',
324+
lambda m: f'"{m.group(1).replace(chr(92) + chr(34), chr(34))}"',
325+
fixed_text
326+
)
327+
return fixed_text
328+
329+
def _extract_translations_from_malformed_json(
330+
self,
331+
json_text: str,
332+
expected_count: int) -> List[str]:
333+
"""Extract translations from malformed JSON as a fallback.
334+
335+
Args:
336+
json_text: Malformed JSON text
337+
expected_count: Expected number of translations
338+
339+
Returns:
340+
List of extracted translations
341+
342+
Raises:
343+
ValueError: If extraction fails or count mismatch
344+
"""
345+
if '[' not in json_text or ']' not in json_text:
346+
raise ValueError("No array structure found in malformed JSON")
347+
348+
# Extract content between first [ and last ]
349+
start_idx = json_text.find('[')
350+
end_idx = json_text.rfind(']') + 1
351+
array_content = json_text[start_idx:end_idx]
352+
353+
# Try to extract quoted strings
354+
matches = re.findall(r'"([^"]*(?:\\.[^"]*)*)"', array_content)
355+
if not matches or len(matches) != expected_count:
356+
raise ValueError(
357+
f"Could not extract expected number of translations "
358+
f"(expected {expected_count}, got {len(matches) if matches else 0})"
359+
)
360+
361+
# Unescape the extracted strings
362+
return [match.replace('\\"', '"').replace("\\'", "'") for match in matches]
363+
364+
def _process_bulk_response(
365+
self,
366+
response_text: str,
367+
original_texts: List[str],
368+
target_language: str,
369+
_stripped_texts: Optional[List[str]] = None) -> List[str]:
370+
"""Process a bulk translation response.
371+
372+
Args:
373+
response_text: The raw response from the AI provider
374+
original_texts: The original texts WITH whitespace
375+
target_language: Target language code
376+
_stripped_texts: The stripped texts sent to AI (unused, for future use)
377+
"""
378+
# Note: _stripped_texts parameter kept for future validation features
379+
# Current validation happens per-entry using original_texts
285380
try:
286-
# Clean the response text for formatting issues
287381
clean_response = self._clean_json_response(response_text)
288382
logging.debug("Cleaned JSON response: %s...", clean_response[:100])
289383

290384
# First attempt: try parsing as-is
291385
try:
292386
translated_texts = json.loads(clean_response)
293387
except json.JSONDecodeError:
294-
# Second attempt: fix various quote types that break JSON
295-
# First, normalize all quote types to standard quotes
296-
# Handle different languages' quotation marks
297-
quote_fixes = [
298-
('“', '"'), # Left double quotation mark
299-
('”', '"'), # Right double quotation mark
300-
('„', '"'), # Double low-9 quotation mark (Lithuanian, German)
301-
('"', '"'), # Left double quotation mark (alternative)
302-
('‘', "'"), # Left single quotation mark
303-
('’', "'"), # Right single quotation mark
304-
('‚', "'"), # Single low-9 quotation mark
305-
('«', '"'), # Left-pointing double angle quotation mark
306-
('»', '"'), # Right-pointing double angle quotation mark
307-
('‹', "'"), # Left-pointing single angle quotation mark
308-
('›', "'"), # Right-pointing single angle quotation mark
309-
]
310-
311-
fixed_response = clean_response
312-
for old_quote, new_quote in quote_fixes:
313-
fixed_response = fixed_response.replace(old_quote, new_quote)
314-
315-
# Apply fix to all JSON strings (but not the JSON structure quotes)
388+
# Second attempt: fix non-standard quotes
389+
fixed_response = self._fix_json_quotes(clean_response)
316390
try:
317-
# More sophisticated regex to handle quotes inside strings
318-
fixed_response = re.sub(
319-
r'"([^"\\]*(\\.[^"\\]*)*)"',
320-
lambda m: f'"{m.group(1).replace(chr(92) + chr(34), chr(34))}"',
321-
fixed_response)
322391
translated_texts = json.loads(fixed_response)
323-
except json.JSONDecodeError as e:
324-
# Final attempt: try to extract array elements manually
325-
# This is a fallback for severely malformed JSON
326-
logging.warning("API returned malformed JSON, attempting to extract translations manually")
327-
328-
# Try to find array-like structure and extract elements
329-
if '[' in fixed_response and ']' in fixed_response:
330-
# Extract content between first [ and last ]
331-
start_idx = fixed_response.find('[')
332-
end_idx = fixed_response.rfind(']') + 1
333-
array_content = fixed_response[start_idx:end_idx]
334-
335-
# Try to extract quoted strings
336-
matches = re.findall(r'"([^"]*(?:\\.[^"]*)*)"', array_content)
337-
if matches and len(matches) == len(original_texts):
338-
# Unescape the extracted strings
339-
translated_texts = [match.replace('\\"', '"').replace("\\'", "'") for match in matches]
340-
else:
341-
raise ValueError("Could not extract expected number of translations") from e
342-
else:
343-
raise
392+
except json.JSONDecodeError:
393+
# Final attempt: extract from malformed JSON
394+
logging.warning("API returned malformed JSON, extracting translations manually")
395+
translated_texts = self._extract_translations_from_malformed_json(
396+
fixed_response,
397+
len(original_texts)
398+
)
344399

345400
# Validate the format
346401
if not isinstance(translated_texts, list) or len(translated_texts) != len(original_texts):
@@ -386,9 +441,19 @@ def _clean_json_response(self, response_text: str) -> str:
386441

387442
def validate_translation(self, original: str, translated: str, target_language: str) -> str:
388443
"""Validates the translation and retries if necessary."""
444+
# Extract leading/trailing whitespace from original
445+
original_stripped = original.strip()
446+
if not original_stripped:
447+
# If original is all whitespace, preserve it as-is
448+
return original
449+
450+
leading_ws = original[:len(original) - len(original.lstrip())]
451+
trailing_ws = original[len(original.rstrip()):]
452+
453+
# Strip the translation for validation
389454
translated = translated.strip()
390455

391-
if len(translated.split()) > 2 * len(original.split()) + 1:
456+
if len(translated.split()) > 2 * len(original_stripped.split()) + 1:
392457
logging.debug("Translation too verbose (%d words), retrying", len(translated.split()))
393458
return self.retry_long_translation(original, target_language)
394459

@@ -397,10 +462,16 @@ def validate_translation(self, original: str, translated: str, target_language:
397462
logging.debug("Translation contains explanation, retrying")
398463
return self.retry_long_translation(original, target_language)
399464

400-
return translated
465+
# Restore original whitespace
466+
return leading_ws + translated + trailing_ws
401467

402468
def retry_long_translation(self, text: str, target_language: str) -> str:
403469
"""Retries translation for long or explanatory responses."""
470+
# Extract leading/trailing whitespace from original
471+
leading_ws = text[:len(text) - len(text.lstrip())]
472+
trailing_ws = text[len(text.rstrip()):]
473+
text_stripped = text.strip()
474+
404475
prompt = (
405476
f"Translate this text concisely from English to {target_language}. "
406477
"Provide only the direct translation without any explanation or additional context. "
@@ -410,15 +481,16 @@ def retry_long_translation(self, text: str, target_language: str) -> str:
410481
)
411482

412483
try:
413-
content = prompt + text
414-
retried_translation = self._get_provider_response(content)
484+
content = prompt + text_stripped
485+
retried_translation = self._get_provider_response(content).strip()
415486

416-
if len(retried_translation.split()) > 2 * len(text.split()) + 1:
487+
if len(retried_translation.split()) > 2 * len(text_stripped.split()) + 1:
417488
logging.debug("Retry still too verbose, skipping")
418489
return "" # Return empty string instead of English text
419490

420491
logging.debug("Retry successful")
421-
return retried_translation
492+
# Restore original whitespace
493+
return leading_ws + retried_translation + trailing_ws
422494

423495
except Exception as e:
424496
logging.debug("Retry failed: %s", str(e)[:100])
@@ -732,6 +804,22 @@ def _prepare_translation_request(self, po_file, po_file_path, file_lang, detail_
732804
texts = [entry.msgid for entry in entries]
733805
detail_lang = detail_languages.get(file_lang) if detail_languages else None
734806

807+
# Check for and warn about whitespace in msgid
808+
whitespace_entries = [
809+
text for text in texts
810+
if text and (text != text.strip())
811+
]
812+
if whitespace_entries:
813+
logging.warning(
814+
"Found %d entries with leading/trailing whitespace in %s. "
815+
"Whitespace will be preserved in translations, but ideally should be handled in your UI framework.",
816+
len(whitespace_entries),
817+
po_file_path
818+
)
819+
if logging.getLogger().isEnabledFor(logging.DEBUG):
820+
for text in whitespace_entries[:3]: # Show first 3 examples
821+
logging.debug(" Example: %s", repr(text))
822+
735823
return TranslationRequest(
736824
po_file=po_file,
737825
entries=entries,

0 commit comments

Comments
 (0)