|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import json |
| 4 | +import re |
| 5 | +import subprocess |
| 6 | +import sys |
| 7 | +from pathlib import Path |
| 8 | +from typing import Dict, Any, Set, Optional, List |
| 9 | + |
# Maximum number of strings sent to the LLM in one translation request;
# translate_language() slices the pending keys into chunks of this size.
BATCH_SIZE = 50
# Model identifier handed to the `llm` command-line tool through its `-m` option.
LLM_MODEL = 'github/gpt-4o'
# Code of the source localization: `<SOURCE_LANGUAGE>.json` is the file that is
# diffed, and the index entry with this code is skipped during translation.
SOURCE_LANGUAGE = 'en_US'
| 13 | + |
| 14 | + |
def load_json(file_path: Path) -> Dict[str, Any]:
    """Read ``file_path`` as UTF-8 text and return the decoded JSON document.

    A missing file or malformed JSON is reported on stdout, after which the
    original exception is re-raised so the caller decides how to react.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading {file_path}: {e}", flush=True)
        raise
    return parsed
| 23 | + |
| 24 | + |
def save_json(file_path: Path, data: Dict[str, Any]) -> None:
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

    Missing parent directories are created first. Non-ASCII characters are
    written verbatim (not ``\\uXXXX``-escaped) and the file ends with a single
    newline. Any failure is reported on stdout and then re-raised.
    """
    try:
        target_dir = file_path.parent
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as out:
            json.dump(data, out, ensure_ascii=False, indent=2)
            # json.dump emits no trailing newline; add one so the file ends cleanly.
            out.write('\n')
    except Exception as e:
        print(f"Error saving {file_path}: {e}", flush=True)
        raise
| 35 | + |
| 36 | + |
# Matches an added diff line that introduces a JSON key, e.g. `+  "key": "value"`.
# Only spaces/tabs are allowed around the key: `\s` would also match newlines,
# so an added blank line (`+`) followed by an unchanged context line would
# wrongly report that unchanged line's key as changed.
_ADDED_KEY_RE = re.compile(r'^\+[ \t]*"([^"]+)"[ \t]*:', re.MULTILINE)

# Upper bound, in seconds, for the `git diff` subprocess.
_GIT_DIFF_TIMEOUT_SECONDS = 60


def get_changed_keys(en_file: Path) -> Set[str]:
    """Return the JSON keys added or modified in ``en_file`` by the last commit.

    Runs ``git diff HEAD~1 HEAD -- <en_file>`` from the directory two levels
    above the file and collects the key of every added (``+``) line that looks
    like ``"key": ...``. Removed lines are ignored.

    Args:
        en_file: Path to the source-language localization file.

    Returns:
        The set of changed keys; empty when the diff is empty.

    Exits:
        Terminates the process with status 1 when git cannot be run, times
        out, or returns a non-zero exit code.
    """
    print("Getting git diff...", flush=True)

    try:
        result = subprocess.run(
            ['git', 'diff', 'HEAD~1', 'HEAD', '--', str(en_file)],
            capture_output=True,
            text=True,
            # The localization files are UTF-8; do not depend on the locale.
            encoding='utf-8',
            errors='replace',
            check=False,
            cwd=en_file.parent.parent,
            # Without a timeout the TimeoutExpired handler below can never fire.
            timeout=_GIT_DIFF_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired:
        print("Git diff timed out", flush=True)
        sys.exit(1)
    except Exception as e:
        print(f"Exception in get_changed_keys: {e}", flush=True)
        sys.exit(1)

    print(f"Git diff return code: {result.returncode}", flush=True)

    if result.returncode != 0:
        print(f"Git diff error: {result.stderr}", flush=True)
        sys.exit(1)

    if not result.stdout.strip():
        print("No diff found - file unchanged", flush=True)
        return set()

    return {match.group(1) for match in _ADDED_KEY_RE.finditer(result.stdout)}
| 77 | + |
| 78 | + |
# An opening fence line: three backticks plus an optional language tag
# (```json, ```JSON, ```json5, ...). A bare ``` also matches.
_OPENING_FENCE_RE = re.compile(r'```[\w+-]*')


def strip_markdown_code_block(content: str) -> str:
    """Return the payload of the first Markdown code fence in an LLM response.

    The opening fence may carry any language tag, and text before the opening
    fence or after the closing fence is discarded. If there is no closing
    fence, everything after the opening fence is kept. Input without a fence
    line is returned unchanged apart from surrounding whitespace.

    Args:
        content: Raw text returned by the LLM.

    Returns:
        The stripped text found inside the fence, or the stripped input when
        no fence is present.
    """
    text = content.strip()
    lines = text.split('\n')

    # Index of the first line that consists solely of an opening fence.
    opening = next(
        (
            index
            for index, line in enumerate(lines)
            if _OPENING_FENCE_RE.fullmatch(line.strip())
        ),
        None,
    )
    if opening is None:
        return text

    body = lines[opening + 1:]
    # Cut at the first closing fence so trailing commentary is dropped too.
    for index, line in enumerate(body):
        if line.strip() == '```':
            body = body[:index]
            break

    return '\n'.join(body).strip()
| 96 | + |
| 97 | + |
def call_llm(prompt: str, timeout: float = 300) -> Optional[str]:
    """Send ``prompt`` to the ``llm`` command-line tool and return its reply.

    The prompt is passed on stdin to ``llm -m LLM_MODEL``.

    Args:
        prompt: Full prompt text.
        timeout: Maximum number of seconds to wait for the tool.

    Returns:
        The tool's stdout with surrounding whitespace removed, or ``None``
        when the tool cannot be started, times out, exits with a non-zero
        status, or prints nothing.
    """
    try:
        # subprocess.run kills *and waits for* the child when the timeout
        # expires, so no zombie process or open pipe is left behind.
        completed = subprocess.run(
            ['llm', '-m', LLM_MODEL],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired:
        print("LLM call timed out", flush=True)
        return None
    except Exception as e:
        print(f"Exception calling LLM: {e}", flush=True)
        return None

    if completed.returncode != 0:
        print(f"LLM error: {completed.stderr}", flush=True)
        return None

    reply = completed.stdout.strip()
    return reply or None
| 124 | + |
| 125 | + |
def build_translation_prompt(
    keys_dict: Dict[str, str],
    target_language: str,
    full_en_data: Dict[str, str],
    existing_target_data: Dict[str, str]
) -> str:
    """Assemble the prompt that asks the LLM to translate ``keys_dict``.

    The prompt consists of six sections separated by blank lines: the task
    statement, application context, the complete English localization, the
    existing translations for the target language, the translation rules and
    finally the strings to translate. All JSON payloads are pretty-printed
    with non-ASCII characters left unescaped.
    """

    def as_json(payload: Dict[str, str]) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    task = f'You are a professional translator working on localization for Harmonoid, a music player application. Translate the following JSON object from English to {target_language}.'
    context = 'CONTEXT: These strings are UI text for a music player app. They include terms related to music playback, playlists, albums, artists, audio settings, and media library management.'
    english_reference = '\n'.join((
        'FULL ENGLISH LOCALIZATION (all strings for reference):',
        as_json(full_en_data),
    ))
    existing_reference = '\n'.join((
        f'EXISTING {target_language.upper()} TRANSLATIONS (for consistency reference):',
        as_json(existing_target_data),
    ))
    rules = '\n'.join((
        'IMPORTANT RULES:',
        '1. Keep all JSON keys EXACTLY the same (do not translate keys)',
        '2. Only translate the VALUES',
        '3. Preserve any special formatting like quotes (""), placeholders ("M", "N", "X", "ENTRY", "PLAYLIST", etc.)',
        '4. Maintain the same meaning, punctuation, capitalization, structure and formatting as the English source',
        '5. Use appropriate music/audio terminology for the target language',
        '6. Maintain CONSISTENCY with the existing translations shown above - use the same style, tone, and terminology choices',
        '7. For technical terms (e.g., "playlist", "equalizer"), check if they were translated or kept in English in existing translations and follow the same pattern',
        '8. Return ONLY the translated JSON object, no additional text or explanations',
        '9. Ensure the output is valid JSON',
        '10. Try to keep similar string length as the original English string (if possible and natural in the target language)',
    ))
    request = '\n'.join((
        'STRINGS TO TRANSLATE:',
        as_json(keys_dict),
    ))

    return '\n\n'.join(
        (task, context, english_reference, existing_reference, rules, request)
    )
| 157 | + |
| 158 | + |
def translate_keys(
    keys_dict: Dict[str, str],
    target_language: str,
    full_en_data: Dict[str, str],
    existing_target_data: Dict[str, str]
) -> Dict[str, str]:
    """Translate one batch of strings with the LLM.

    Args:
        keys_dict: Mapping of localization key to English source text.
        target_language: Language name inserted into the prompt.
        full_en_data: Complete English localization, sent as reference.
        existing_target_data: Current translations for the target language,
            sent as a consistency reference.

    Returns:
        A mapping with exactly the keys of ``keys_dict``. Keys the LLM
        omitted, or answered with a value of the wrong type, keep their
        English source text. ``keys_dict`` itself is returned when the call
        fails or the reply is not a JSON object. An empty input yields an
        empty dict without contacting the LLM.
    """
    if not keys_dict:
        return {}

    print("Calling LLM...", flush=True)

    prompt = build_translation_prompt(keys_dict, target_language, full_en_data, existing_target_data)
    response = call_llm(prompt)

    if not response:
        print("Empty or failed LLM response, returning original keys", flush=True)
        return keys_dict

    print("LLM returned successfully", flush=True)

    # Strip markdown formatting
    content = strip_markdown_code_block(response)

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}", flush=True)
        print(f"Content preview: {content[:500]}...", flush=True)
        return keys_dict

    if not isinstance(parsed, dict):
        print("LLM response is not a dictionary", flush=True)
        return keys_dict

    missing_keys = set(keys_dict.keys()) - set(parsed.keys())
    if missing_keys:
        print(f"Warning: Missing keys in translation: {missing_keys}", flush=True)

    # The prompt also contains the full localization as reference, so the
    # model may echo keys that were never requested. Returning them would let
    # them overwrite existing translations, so only requested keys are kept.
    unexpected_keys = sorted(set(parsed.keys()) - set(keys_dict.keys()))
    if unexpected_keys:
        print(f"Warning: Ignoring unexpected keys in translation: {unexpected_keys}", flush=True)

    translated = {}
    invalid_keys = []
    for key, source_text in keys_dict.items():
        candidate = parsed.get(key)
        if key in parsed and not isinstance(candidate, type(source_text)):
            # e.g. null, a number or a nested object where a string is expected
            invalid_keys.append(key)
        if isinstance(candidate, type(source_text)):
            translated[key] = candidate
        else:
            # Fall back to the English source for missing or unusable entries.
            translated[key] = source_text

    if invalid_keys:
        print(f"Warning: Invalid values in translation, keeping original for: {invalid_keys}", flush=True)

    return translated
| 205 | + |
| 206 | + |
def translate_language(
    lang_code: str,
    lang_name: str,
    keys_to_translate: Dict[str, str],
    en_data: Dict[str, str],
    existing_data: Dict[str, str],
    localizations_dir: Path
) -> bool:
    """Translate the pending keys for one language and write its JSON file.

    The keys are sent to the LLM in chunks of ``BATCH_SIZE``. The results are
    layered over ``existing_data`` and the merged mapping is written to
    ``<localizations_dir>/<lang_code>.json`` in the key order of ``en_data``;
    keys that have no translation at all fall back to the English text.

    Returns:
        ``False`` when there was nothing to translate (no file is written),
        ``True`` after the file has been saved.
    """
    if not keys_to_translate:
        print("Up to date", flush=True)
        return False

    print(f"Translating {len(keys_to_translate)} keys...", flush=True)

    pending = list(keys_to_translate.keys())
    batch_total = -(-len(pending) // BATCH_SIZE)  # ceiling division
    collected = {}

    for batch_index, offset in enumerate(range(0, len(pending), BATCH_SIZE), start=1):
        chunk = pending[offset:offset + BATCH_SIZE]
        chunk_source = {key: keys_to_translate[key] for key in chunk}

        print(f"Batch {batch_index}/{batch_total} ({len(chunk)} keys)", flush=True)

        collected.update(translate_keys(chunk_source, lang_name, en_data, existing_data))

    # New translations win over what was already on disk.
    merged = dict(existing_data)
    merged.update(collected)

    # Rebuild in en_data order; keys absent from en_data are not carried over.
    ordered = {}
    for key in en_data:
        ordered[key] = merged[key] if key in merged else en_data[key]

    output_path = localizations_dir / f"{lang_code}.json"
    save_json(output_path, ordered)
    print(f"✓ Saved to {output_path.name}", flush=True)

    return True
| 247 | + |
| 248 | + |
def main() -> None:
    """Translate the keys changed in the last commit into every listed language.

    Steps:
      1. Locate the project root two directories above this script and load
         ``localizations/<SOURCE_LANGUAGE>.json``.
      2. Ask git which keys of that file changed in the latest commit; exit
         with status 0 when there are none.
      3. Load the language list from ``index.json`` and, for every entry other
         than the source language, translate the changed keys and rewrite
         that language's file.

    Exits with status 1 when a required file is missing or cannot be loaded.
    """
    print("Starting translation script...", flush=True)

    # Setup paths
    script_dir = Path(__file__).parent
    project_root = script_dir.parent.parent
    localizations_dir = project_root / "localizations"
    index_file = project_root / "index.json"
    en_file = localizations_dir / f"{SOURCE_LANGUAGE}.json"

    print("Paths:", flush=True)
    print(f"  project_root: {project_root}", flush=True)
    print(f"  en_file: {en_file}", flush=True)

    # Validate English localization file exists
    if not en_file.exists():
        print(f"Error: {en_file} not found", flush=True)
        sys.exit(1)

    # Load English localization file
    try:
        en_data = load_json(en_file)
        print(f"Loaded {len(en_data)} keys from {SOURCE_LANGUAGE}.json", flush=True)
    except Exception:
        sys.exit(1)

    # Get keys that were changed in the latest commit
    changed_keys = get_changed_keys(en_file)

    if not changed_keys:
        print("No changed keys found - nothing to translate", flush=True)
        sys.exit(0)

    print(f"Found {len(changed_keys)} changed keys: {', '.join(sorted(changed_keys))}", flush=True)

    # Load list of available languages from index.json
    if not index_file.exists():
        print(f"Error: {index_file} not found", flush=True)
        sys.exit(1)

    try:
        languages = load_json(index_file)
        print(f"Loaded {len(languages)} languages", flush=True)
    except Exception:
        sys.exit(1)

    # The work list is identical for every language, so build it once.
    # Iterating en_data (not the changed_keys set) keeps the batch order
    # deterministic and aligned with the source file.
    keys_to_translate = {
        key: text for key, text in en_data.items() if key in changed_keys
    }

    # Translate changed keys for each language
    translated_count = 0

    for lang_info in languages:
        # Guard against entries that are not objects (e.g. plain strings).
        if not isinstance(lang_info, dict):
            print(f"Warning: Invalid language entry: {lang_info}", flush=True)
            continue

        lang_code = lang_info.get('code')
        lang_name = lang_info.get('name')

        if not lang_code or not lang_name:
            print(f"Warning: Invalid language entry: {lang_info}", flush=True)
            continue

        # Skip English since it's the source language
        if lang_code == SOURCE_LANGUAGE:
            continue

        print(f"\n[{lang_code}] {lang_name}", flush=True)

        # Load existing translations for this language
        target_file = localizations_dir / f"{lang_code}.json"
        if target_file.exists():
            try:
                existing_data = load_json(target_file)
            except (OSError, ValueError):
                # Same policy as the other load sites: abort with status 1
                # instead of an unhandled traceback.
                print(f"Error: could not read existing translations for {lang_code}", flush=True)
                sys.exit(1)
        else:
            existing_data = {}

        # Translate the language
        if translate_language(lang_code, lang_name, keys_to_translate, en_data, existing_data, localizations_dir):
            translated_count += 1

    print(f"\n✓ Done - translated {translated_count} language(s)", flush=True)
| 325 | + |
| 326 | + |
# Run the translation workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments