@@ -206,6 +206,9 @@ def perform_translation_without_validation(
206206 target_language : str ,
207207 detail_language : Optional [str ] = None ) -> str :
208208 """Performs translation without validation for single words or short phrases."""
209+ # Strip text before sending to AI (whitespace will be restored in validate_translation)
210+ text_stripped = text .strip ()
211+
209212 # Use the detailed language name if provided, otherwise use the short code
210213 target_lang_text = detail_language if detail_language else target_language
211214
@@ -216,7 +219,7 @@ def perform_translation_without_validation(
216219 )
217220
218221 return self .validate_translation (text , self .perform_translation (
219- prompt + text , target_language , is_bulk = False , detail_language = detail_language
222+ prompt + text_stripped , target_language , is_bulk = False , detail_language = detail_language
220223 ), target_language )
221224
222225 @staticmethod
@@ -231,6 +234,7 @@ def get_translation_prompt(target_language: str, is_bulk: bool, detail_language:
231234 "Provide only the translations in a JSON array format, maintaining the original order. "
232235 "Each translation should be concise and direct, without explanations or additional context. "
233236 "Keep special characters, placeholders, and formatting intact. "
237+ "Do NOT add or remove any leading/trailing whitespace - translate only the text content. "
234238 "If a term should not be translated (like 'URL' or technical terms), keep it as is. "
235239 "Example format: [\" Translation 1\" , \" Translation 2\" , ...]\n \n "
236240 "Texts to translate:\n "
@@ -253,15 +257,21 @@ def perform_translation(
253257 """Performs the actual translation using the selected provider's API."""
254258 logging .debug ("Translating to '%s' via %s API" , target_language , self .config .provider .value )
255259 prompt = self .get_translation_prompt (target_language , is_bulk , detail_language )
256- content = prompt + (json .dumps (texts ) if is_bulk else texts )
260+
261+ # For bulk mode, strip whitespace before sending to AI
262+ if is_bulk :
263+ stripped_texts = [text .strip () for text in texts ]
264+ content = prompt + json .dumps (stripped_texts )
265+ else :
266+ content = prompt + texts
257267
258268 try :
259269 # Get the response text from the provider
260270 response_text = self ._get_provider_response (content )
261271
262272 # Process the response according to bulk mode
263273 if is_bulk :
264- return self ._process_bulk_response (response_text , texts , target_language )
274+ return self ._process_bulk_response (response_text , texts , target_language , stripped_texts )
265275 return self .validate_translation (texts , response_text , target_language )
266276
267277 except Exception as e :
@@ -280,67 +290,112 @@ def _get_provider_response(self, content: str) -> str:
280290 return ""
281291 return provider_instance .translate (self .config .provider_clients , self .config .model , content )
282292
283- def _process_bulk_response (self , response_text : str , original_texts : List [str ], target_language : str ) -> List [str ]:
284- """Process a bulk translation response."""
293+ @staticmethod
294+ def _fix_json_quotes (json_text : str ) -> str :
295+ """Fix non-standard quotes in JSON response.
296+
297+ Args:
298+ json_text: JSON text with potentially non-standard quotes
299+
300+ Returns:
301+ JSON text with normalized quotes
302+ """
303+ quote_fixes = [
304+ ('"' , '"' ), # Left double quotation mark
305+ ('"' , '"' ), # Right double quotation mark
306+ ('„' , '"' ), # Double low-9 quotation mark (Lithuanian, German)
307+ ('"' , '"' ), # Left double quotation mark (alternative)
308+ (''', "'"), # Left single quotation mark
309+ (''' , "'" ), # Right single quotation mark
310+ ('‚' , "'" ), # Single low-9 quotation mark
311+ ('«' , '"' ), # Left-pointing double angle quotation mark
312+ ('»' , '"' ), # Right-pointing double angle quotation mark
313+ ('‹' , "'" ), # Left-pointing single angle quotation mark
314+ ('›' , "'" ), # Right-pointing single angle quotation mark
315+ ]
316+
317+ fixed_text = json_text
318+ for old_quote , new_quote in quote_fixes :
319+ fixed_text = fixed_text .replace (old_quote , new_quote )
320+
321+ # Apply regex fix to handle quotes inside strings
322+ fixed_text = re .sub (
323+ r'"([^"\\]*(\\.[^"\\]*)*)"' ,
324+ lambda m : f'"{ m .group (1 ).replace (chr (92 ) + chr (34 ), chr (34 ))} "' ,
325+ fixed_text
326+ )
327+ return fixed_text
328+
329+ def _extract_translations_from_malformed_json (
330+ self ,
331+ json_text : str ,
332+ expected_count : int ) -> List [str ]:
333+ """Extract translations from malformed JSON as a fallback.
334+
335+ Args:
336+ json_text: Malformed JSON text
337+ expected_count: Expected number of translations
338+
339+ Returns:
340+ List of extracted translations
341+
342+ Raises:
343+ ValueError: If extraction fails or count mismatch
344+ """
345+ if '[' not in json_text or ']' not in json_text :
346+ raise ValueError ("No array structure found in malformed JSON" )
347+
348+ # Extract content between first [ and last ]
349+ start_idx = json_text .find ('[' )
350+ end_idx = json_text .rfind (']' ) + 1
351+ array_content = json_text [start_idx :end_idx ]
352+
353+ # Try to extract quoted strings
354+ matches = re .findall (r'"([^"]*(?:\\.[^"]*)*)"' , array_content )
355+ if not matches or len (matches ) != expected_count :
356+ raise ValueError (
357+ f"Could not extract expected number of translations "
358+ f"(expected { expected_count } , got { len (matches ) if matches else 0 } )"
359+ )
360+
361+ # Unescape the extracted strings
362+ return [match .replace ('\\ "' , '"' ).replace ("\\ '" , "'" ) for match in matches ]
363+
364+ def _process_bulk_response (
365+ self ,
366+ response_text : str ,
367+ original_texts : List [str ],
368+ target_language : str ,
369+ _stripped_texts : Optional [List [str ]] = None ) -> List [str ]:
370+ """Process a bulk translation response.
371+
372+ Args:
373+ response_text: The raw response from the AI provider
374+ original_texts: The original texts WITH whitespace
375+ target_language: Target language code
376+ _stripped_texts: The stripped texts sent to AI (unused, for future use)
377+ """
378+ # Note: _stripped_texts parameter kept for future validation features
379+ # Current validation happens per-entry using original_texts
285380 try :
286- # Clean the response text for formatting issues
287381 clean_response = self ._clean_json_response (response_text )
288382 logging .debug ("Cleaned JSON response: %s..." , clean_response [:100 ])
289383
290384 # First attempt: try parsing as-is
291385 try :
292386 translated_texts = json .loads (clean_response )
293387 except json .JSONDecodeError :
294- # Second attempt: fix various quote types that break JSON
295- # First, normalize all quote types to standard quotes
296- # Handle different languages' quotation marks
297- quote_fixes = [
298- ('"' , '"' ), # Left double quotation mark
299- ('"' , '"' ), # Right double quotation mark
300- ('„' , '"' ), # Double low-9 quotation mark (Lithuanian, German)
301- ('"' , '"' ), # Left double quotation mark (alternative)
302- (''', "'"), # Left single quotation mark
303- (''' , "'" ), # Right single quotation mark
304- ('‚' , "'" ), # Single low-9 quotation mark
305- ('«' , '"' ), # Left-pointing double angle quotation mark
306- ('»' , '"' ), # Right-pointing double angle quotation mark
307- ('‹' , "'" ), # Left-pointing single angle quotation mark
308- ('›' , "'" ), # Right-pointing single angle quotation mark
309- ]
310-
311- fixed_response = clean_response
312- for old_quote , new_quote in quote_fixes :
313- fixed_response = fixed_response .replace (old_quote , new_quote )
314-
315- # Apply fix to all JSON strings (but not the JSON structure quotes)
388+ # Second attempt: fix non-standard quotes
389+ fixed_response = self ._fix_json_quotes (clean_response )
316390 try :
317- # More sophisticated regex to handle quotes inside strings
318- fixed_response = re .sub (
319- r'"([^"\\]*(\\.[^"\\]*)*)"' ,
320- lambda m : f'"{ m .group (1 ).replace (chr (92 ) + chr (34 ), chr (34 ))} "' ,
321- fixed_response )
322391 translated_texts = json .loads (fixed_response )
323- except json .JSONDecodeError as e :
324- # Final attempt: try to extract array elements manually
325- # This is a fallback for severely malformed JSON
326- logging .warning ("API returned malformed JSON, attempting to extract translations manually" )
327-
328- # Try to find array-like structure and extract elements
329- if '[' in fixed_response and ']' in fixed_response :
330- # Extract content between first [ and last ]
331- start_idx = fixed_response .find ('[' )
332- end_idx = fixed_response .rfind (']' ) + 1
333- array_content = fixed_response [start_idx :end_idx ]
334-
335- # Try to extract quoted strings
336- matches = re .findall (r'"([^"]*(?:\\.[^"]*)*)"' , array_content )
337- if matches and len (matches ) == len (original_texts ):
338- # Unescape the extracted strings
339- translated_texts = [match .replace ('\\ "' , '"' ).replace ("\\ '" , "'" ) for match in matches ]
340- else :
341- raise ValueError ("Could not extract expected number of translations" ) from e
342- else :
343- raise
392+ except json .JSONDecodeError :
393+ # Final attempt: extract from malformed JSON
394+ logging .warning ("API returned malformed JSON, extracting translations manually" )
395+ translated_texts = self ._extract_translations_from_malformed_json (
396+ fixed_response ,
397+ len (original_texts )
398+ )
344399
345400 # Validate the format
346401 if not isinstance (translated_texts , list ) or len (translated_texts ) != len (original_texts ):
@@ -386,9 +441,19 @@ def _clean_json_response(self, response_text: str) -> str:
386441
387442 def validate_translation (self , original : str , translated : str , target_language : str ) -> str :
388443 """Validates the translation and retries if necessary."""
444+ # Extract leading/trailing whitespace from original
445+ original_stripped = original .strip ()
446+ if not original_stripped :
447+ # If original is all whitespace, preserve it as-is
448+ return original
449+
450+ leading_ws = original [:len (original ) - len (original .lstrip ())]
451+ trailing_ws = original [len (original .rstrip ()):]
452+
453+ # Strip the translation for validation
389454 translated = translated .strip ()
390455
391- if len (translated .split ()) > 2 * len (original .split ()) + 1 :
456+ if len (translated .split ()) > 2 * len (original_stripped .split ()) + 1 :
392457 logging .debug ("Translation too verbose (%d words), retrying" , len (translated .split ()))
393458 return self .retry_long_translation (original , target_language )
394459
@@ -397,10 +462,16 @@ def validate_translation(self, original: str, translated: str, target_language:
397462 logging .debug ("Translation contains explanation, retrying" )
398463 return self .retry_long_translation (original , target_language )
399464
400- return translated
465+ # Restore original whitespace
466+ return leading_ws + translated + trailing_ws
401467
402468 def retry_long_translation (self , text : str , target_language : str ) -> str :
403469 """Retries translation for long or explanatory responses."""
470+ # Extract leading/trailing whitespace from original
471+ leading_ws = text [:len (text ) - len (text .lstrip ())]
472+ trailing_ws = text [len (text .rstrip ()):]
473+ text_stripped = text .strip ()
474+
404475 prompt = (
405476 f"Translate this text concisely from English to { target_language } . "
406477 "Provide only the direct translation without any explanation or additional context. "
@@ -410,15 +481,16 @@ def retry_long_translation(self, text: str, target_language: str) -> str:
410481 )
411482
412483 try :
413- content = prompt + text
414- retried_translation = self ._get_provider_response (content )
484+ content = prompt + text_stripped
485+ retried_translation = self ._get_provider_response (content ). strip ()
415486
416- if len (retried_translation .split ()) > 2 * len (text .split ()) + 1 :
487+ if len (retried_translation .split ()) > 2 * len (text_stripped .split ()) + 1 :
417488 logging .debug ("Retry still too verbose, skipping" )
418489 return "" # Return empty string instead of English text
419490
420491 logging .debug ("Retry successful" )
421- return retried_translation
492+ # Restore original whitespace
493+ return leading_ws + retried_translation + trailing_ws
422494
423495 except Exception as e :
424496 logging .debug ("Retry failed: %s" , str (e )[:100 ])
@@ -732,6 +804,22 @@ def _prepare_translation_request(self, po_file, po_file_path, file_lang, detail_
732804 texts = [entry .msgid for entry in entries ]
733805 detail_lang = detail_languages .get (file_lang ) if detail_languages else None
734806
807+ # Check for and warn about whitespace in msgid
808+ whitespace_entries = [
809+ text for text in texts
810+ if text and (text != text .strip ())
811+ ]
812+ if whitespace_entries :
813+ logging .warning (
814+ "Found %d entries with leading/trailing whitespace in %s. "
815+ "Whitespace will be preserved in translations, but ideally should be handled in your UI framework." ,
816+ len (whitespace_entries ),
817+ po_file_path
818+ )
819+ if logging .getLogger ().isEnabledFor (logging .DEBUG ):
820+ for text in whitespace_entries [:3 ]: # Show first 3 examples
821+ logging .debug (" Example: %s" , repr (text ))
822+
735823 return TranslationRequest (
736824 po_file = po_file ,
737825 entries = entries ,
0 commit comments