Skip to content

Commit fab1b42

Browse files
committed
feat: setup translations using LLM
1 parent fa0947d commit fab1b42

23 files changed

+1271
-861
lines changed

.github/scripts/translate.py

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
#!/usr/bin/env python3
2+
3+
import json
4+
import re
5+
import subprocess
6+
import sys
7+
from pathlib import Path
8+
from typing import Dict, Any, Set, Optional, List
9+
10+
# Maximum number of keys sent to the LLM in a single translation request.
BATCH_SIZE = 50
11+
# Model identifier passed to the `llm` command line tool via its `-m` option.
LLM_MODEL = 'github/gpt-4o'
12+
# Code of the language translated from; "<code>.json" is the source file and
# this language is skipped when iterating over the target languages.
SOURCE_LANGUAGE = 'en_US'
13+
14+
15+
def load_json(file_path: Path) -> Dict[str, Any]:
    """Read ``file_path`` as UTF-8 text and return the decoded JSON document.

    A missing file or malformed JSON is reported on stdout and the original
    exception is then re-raised so the caller decides how to react.
    """
    try:
        with open(file_path, encoding='utf-8') as handle:
            parsed = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError) as err:
        print(f"Error loading {file_path}: {err}", flush=True)
        raise
    else:
        return parsed
23+
24+
25+
def save_json(file_path: Path, data: Dict[str, Any]) -> None:
    """Serialize ``data`` to ``file_path`` as indented, non-ASCII-escaped JSON.

    Missing parent directories are created first. The file is written as
    UTF-8 with a two-space indent and ends with a single newline. Any failure
    is reported on stdout and then re-raised.
    """
    try:
        parent_dir = file_path.parent
        parent_dir.mkdir(parents=True, exist_ok=True)
        with file_path.open('w', encoding='utf-8') as handle:
            json.dump(data, handle, ensure_ascii=False, indent=2)
            handle.write('\n')
    except Exception as err:
        print(f"Error saving {file_path}: {err}", flush=True)
        raise
35+
36+
37+
def get_changed_keys(
    en_file: Path,
    base_ref: str = 'HEAD~1',
    head_ref: str = 'HEAD',
    timeout: float = 60,
) -> Set[str]:
    """Return the JSON keys added or modified in ``en_file`` between two revisions.

    Runs ``git diff <base_ref> <head_ref> -- <en_file>`` from the directory two
    levels above ``en_file`` and collects the key of every added (``+``) line
    that looks like ``"key": ...``.

    Args:
        en_file: Path of the source localization file.
        base_ref: Older revision of the comparison (defaults to the previous commit).
        head_ref: Newer revision of the comparison (defaults to ``HEAD``).
        timeout: Seconds to wait for git before giving up.

    Returns:
        The set of changed keys; empty when the diff is empty.

    Exits the process with status 1 when git cannot be run, times out, or
    returns a non-zero status.
    """
    print("Getting git diff...", flush=True)

    try:
        result = subprocess.run(
            ['git', 'diff', base_ref, head_ref, '--', str(en_file)],
            capture_output=True,
            # The localization files are written as UTF-8, so decode the diff
            # as UTF-8 instead of relying on the locale's default encoding.
            text=True,
            encoding='utf-8',
            check=False,
            cwd=en_file.parent.parent,
            # Without a timeout the TimeoutExpired handler below can never fire.
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print("Git diff timed out", flush=True)
        sys.exit(1)
    except Exception as e:
        print(f"Exception in get_changed_keys: {e}", flush=True)
        sys.exit(1)

    print(f"Git diff return code: {result.returncode}", flush=True)

    if result.returncode != 0:
        print(f"Git diff error: {result.stderr}", flush=True)
        sys.exit(1)

    if not result.stdout.strip():
        print("No diff found - file unchanged", flush=True)
        return set()

    # Match added lines such as:  +  "key": "value"
    # Only spaces/tabs are allowed around the key: `\s` would also match a
    # newline, so an added blank line followed by an unchanged `"key":` context
    # line would wrongly be reported as changed. The key pattern accepts
    # backslash escapes so keys containing \" are captured in full.
    pattern = re.compile(r'^\+[ \t]*("(?:[^"\\]|\\.)+")[ \t]*:', re.MULTILINE)

    changed_keys: Set[str] = set()
    for match in pattern.finditer(result.stdout):
        literal = match.group(1)
        try:
            # Decode JSON escapes so the key equals the one json.load() yields.
            key = json.loads(literal)
        except json.JSONDecodeError:
            key = literal[1:-1]
        changed_keys.add(key)

    return changed_keys
77+
78+
79+
def strip_markdown_code_block(content: str) -> str:
    """Remove a surrounding markdown code fence from an LLM response.

    The opening fence is dropped when the first line consists of three
    backticks followed by an optional language tag (``json``, ``JSON``,
    ``json5``, a tag preceded by a space, ...). A closing fence on the last
    line is dropped as well. Text that does not start with a fence is returned
    stripped of surrounding whitespace but otherwise untouched.

    Args:
        content: Raw text returned by the LLM.

    Returns:
        The text without the enclosing code fence.
    """
    content = content.strip()

    if not content.startswith('```'):
        return content

    lines = content.split('\n')

    # Accept any language tag on the opening fence. Recognising only the exact
    # strings "```json" and "```" would leave e.g. "```JSON" in place while the
    # closing fence is still removed, which guarantees a JSON parse failure.
    if re.fullmatch(r'```[ \t]*[\w+#.-]*', lines[0].strip()):
        lines = lines[1:]

    # Remove closing ```
    if lines and lines[-1].strip() == '```':
        lines = lines[:-1]

    return '\n'.join(lines).strip()
96+
97+
98+
def call_llm(
    prompt: str,
    model: Optional[str] = None,
    timeout: float = 300,
) -> Optional[str]:
    """Send ``prompt`` to the ``llm`` command line tool and return its answer.

    Args:
        prompt: Text written to the tool's standard input.
        model: Model identifier for ``llm -m``; defaults to ``LLM_MODEL``.
        timeout: Seconds to wait for the tool before giving up.

    Returns:
        The stripped standard output, or ``None`` when the tool cannot be
        started, times out, exits with a non-zero status, or prints nothing.
    """
    command = ['llm', '-m', LLM_MODEL if model is None else model]

    try:
        # subprocess.run kills and waits for the child itself when the timeout
        # expires, so no zombie process or open pipe is left behind.
        completed = subprocess.run(
            command,
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired:
        print("LLM call timed out", flush=True)
        return None
    except Exception as e:
        print(f"Exception calling LLM: {e}", flush=True)
        return None

    if completed.returncode != 0:
        print(f"LLM error: {completed.stderr}", flush=True)
        return None

    output = completed.stdout.strip()
    return output or None
124+
125+
126+
def build_translation_prompt(
    keys_dict: Dict[str, str],
    target_language: str,
    full_en_data: Dict[str, str],
    existing_target_data: Dict[str, str]
) -> str:
    """Assemble the instruction text sent to the LLM for one batch of strings.

    The prompt consists of a role/context introduction, the complete English
    localization, the existing translations for ``target_language``, the list
    of translation rules and finally the strings in ``keys_dict`` to translate.
    All three dictionaries are embedded as indented JSON.
    """
    def as_json(payload: Dict[str, str]) -> str:
        # Shared JSON rendering for every embedded dictionary.
        return json.dumps(payload, ensure_ascii=False, indent=2)

    rules = (
        '1. Keep all JSON keys EXACTLY the same (do not translate keys)',
        '2. Only translate the VALUES',
        '3. Preserve any special formatting like quotes (""), placeholders ("M", "N", "X", "ENTRY", "PLAYLIST", etc.)',
        '4. Maintain the same meaning, punctuation, capitalization, structure and formatting as the English source',
        '5. Use appropriate music/audio terminology for the target language',
        '6. Maintain CONSISTENCY with the existing translations shown above - use the same style, tone, and terminology choices',
        '7. For technical terms (e.g., "playlist", "equalizer"), check if they were translated or kept in English in existing translations and follow the same pattern',
        '8. Return ONLY the translated JSON object, no additional text or explanations',
        '9. Ensure the output is valid JSON',
        '10. Try to keep similar string length as the original English string (if possible and natural in the target language)',
    )

    sections = [
        f'You are a professional translator working on localization for Harmonoid, a music player application. Translate the following JSON object from English to {target_language}.',
        '',
        'CONTEXT: These strings are UI text for a music player app. They include terms related to music playback, playlists, albums, artists, audio settings, and media library management.',
        '',
        'FULL ENGLISH LOCALIZATION (all strings for reference):',
        as_json(full_en_data),
        '',
        f'EXISTING {target_language.upper()} TRANSLATIONS (for consistency reference):',
        as_json(existing_target_data),
        '',
        'IMPORTANT RULES:',
        *rules,
        '',
        'STRINGS TO TRANSLATE:',
        as_json(keys_dict),
    ]
    return '\n'.join(sections)
157+
158+
159+
def translate_keys(
    keys_dict: Dict[str, str],
    target_language: str,
    full_en_data: Dict[str, str],
    existing_target_data: Dict[str, str]
) -> Dict[str, str]:
    """Translate one batch of strings with the LLM.

    Args:
        keys_dict: Key -> English text pairs to translate.
        target_language: Human-readable name of the target language.
        full_en_data: Complete English localization, given to the LLM as context.
        existing_target_data: Current translations, given to the LLM as context.

    Returns:
        A dictionary with exactly the keys of ``keys_dict``. A key keeps its
        English text when the LLM omitted it or returned a non-string for a
        string source. When the call fails or the response is not a JSON
        object, ``keys_dict`` itself is returned unchanged.
    """
    if not keys_dict:
        return {}

    print("Calling LLM...", flush=True)

    prompt = build_translation_prompt(keys_dict, target_language, full_en_data, existing_target_data)
    response = call_llm(prompt)

    if not response:
        print("Empty or failed LLM response, returning original keys", flush=True)
        return keys_dict

    print("LLM returned successfully", flush=True)

    # Strip markdown formatting
    content = strip_markdown_code_block(response)

    # Parse JSON response
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}", flush=True)
        print(f"Content preview: {content[:500]}...", flush=True)
        return keys_dict

    if not isinstance(parsed, dict):
        print("LLM response is not a dictionary", flush=True)
        return keys_dict

    # The prompt contains the whole English file, so the model may answer with
    # more keys than requested. Passing those through would let it overwrite
    # translations that were never meant to change.
    unexpected_keys = set(parsed) - set(keys_dict)
    if unexpected_keys:
        print(f"Warning: Ignoring unexpected keys in translation: {sorted(unexpected_keys)}", flush=True)

    translated: Dict[str, str] = {}
    missing_keys = set()
    invalid_keys = set()

    for key, source_text in keys_dict.items():
        if key not in parsed:
            missing_keys.add(key)
            translated[key] = source_text
            continue

        value = parsed[key]
        # A string source must come back as a string; null, numbers or nested
        # objects would corrupt the localization file.
        if isinstance(source_text, str) and not isinstance(value, str):
            invalid_keys.add(key)
            translated[key] = source_text
            continue

        translated[key] = value

    if missing_keys:
        # Missing keys were filled in with their original values above.
        print(f"Warning: Missing keys in translation: {missing_keys}", flush=True)
    if invalid_keys:
        print(f"Warning: Non-string values in translation, keeping original: {sorted(invalid_keys)}", flush=True)

    return translated
205+
206+
207+
def translate_language(
    lang_code: str,
    lang_name: str,
    keys_to_translate: Dict[str, str],
    en_data: Dict[str, str],
    existing_data: Dict[str, str],
    localizations_dir: Path
) -> bool:
    """Translate the pending keys for one language and write its JSON file.

    The keys are sent to ``translate_keys`` in chunks of ``BATCH_SIZE``. The
    results are layered over ``existing_data`` and the output is rebuilt in
    the key order of ``en_data``, using the English text for any key that has
    neither an existing nor a new translation. The file is saved as
    ``<lang_code>.json`` inside ``localizations_dir``.

    Returns ``False`` without writing anything when ``keys_to_translate`` is
    empty, ``True`` after the file has been saved.
    """
    if not keys_to_translate:
        print("Up to date", flush=True)
        return False

    print(f"Translating {len(keys_to_translate)} keys...", flush=True)

    pending = list(keys_to_translate)
    batch_total = (len(pending) + BATCH_SIZE - 1) // BATCH_SIZE
    collected: Dict[str, str] = {}

    # Walk over the pending keys one BATCH_SIZE-sized slice at a time.
    for batch_num, start in enumerate(range(0, len(pending), BATCH_SIZE), start=1):
        chunk = pending[start:start + BATCH_SIZE]
        batch = {key: keys_to_translate[key] for key in chunk}

        print(f"Batch {batch_num}/{batch_total} ({len(chunk)} keys)", flush=True)

        collected.update(translate_keys(batch, lang_name, en_data, existing_data))

    # New translations take precedence over what was already on disk.
    merged = dict(existing_data)
    merged.update(collected)

    # Emit keys in the same order as the English source file.
    ordered: Dict[str, str] = {}
    for key, english_text in en_data.items():
        ordered[key] = merged.get(key, english_text)

    output_path = localizations_dir / f"{lang_code}.json"
    save_json(output_path, ordered)
    print(f"✓ Saved to {output_path.name}", flush=True)

    return True
247+
248+
249+
def main() -> None:
    """Translate the keys changed in the source localization file.

    Steps:
      1. Locate the project root (two directories above this script), the
         ``localizations`` directory and ``index.json``.
      2. Load the source language file and determine its changed keys from git.
      3. For every language listed in ``index.json`` except the source
         language, translate those keys and rewrite the language file.

    Exits with status 0 when there is nothing to translate and with status 1
    when a required file is missing or unreadable, or when the existing
    translations of at least one language could not be loaded.
    """
    print("Starting translation script...", flush=True)

    # Setup paths
    script_dir = Path(__file__).parent
    project_root = script_dir.parent.parent
    localizations_dir = project_root / "localizations"
    index_file = project_root / "index.json"
    en_file = localizations_dir / f"{SOURCE_LANGUAGE}.json"

    print("Paths:", flush=True)
    print(f"  project_root: {project_root}", flush=True)
    print(f"  en_file: {en_file}", flush=True)

    # Validate English localization file exists
    if not en_file.exists():
        print(f"Error: {en_file} not found", flush=True)
        sys.exit(1)

    # Load English localization file
    try:
        en_data = load_json(en_file)
        print(f"Loaded {len(en_data)} keys from {SOURCE_LANGUAGE}.json", flush=True)
    except Exception:
        sys.exit(1)

    # Get keys that were changed in the latest commit
    changed_keys = get_changed_keys(en_file)

    if not changed_keys:
        print("No changed keys found - nothing to translate", flush=True)
        sys.exit(0)

    print(f"Found {len(changed_keys)} changed keys: {', '.join(sorted(changed_keys))}", flush=True)

    # The keys to translate are the same for every language, so build the
    # mapping once. Iterating en_data (rather than the set) keeps the order,
    # and therefore the batches sent to the LLM, deterministic.
    keys_to_translate = {k: v for k, v in en_data.items() if k in changed_keys}

    # Load list of available languages from index.json
    if not index_file.exists():
        print(f"Error: {index_file} not found", flush=True)
        sys.exit(1)

    try:
        languages = load_json(index_file)
        print(f"Loaded {len(languages)} languages", flush=True)
    except Exception:
        sys.exit(1)

    # Translate changed keys for each language
    translated_count = 0
    failed_languages: List[str] = []

    for lang_info in languages:
        # Entries must be objects; anything else cannot carry code/name.
        if not isinstance(lang_info, dict):
            print(f"Warning: Invalid language entry: {lang_info}", flush=True)
            continue

        lang_code = lang_info.get('code')
        lang_name = lang_info.get('name')

        if not lang_code or not lang_name:
            print(f"Warning: Invalid language entry: {lang_info}", flush=True)
            continue

        # Skip English since it's the source language
        if lang_code == SOURCE_LANGUAGE:
            continue

        print(f"\n[{lang_code}] {lang_name}", flush=True)

        # Load existing translations for this language. A single unreadable
        # file must not prevent the remaining languages from being processed.
        target_file = localizations_dir / f"{lang_code}.json"
        try:
            existing_data = load_json(target_file) if target_file.exists() else {}
        except Exception as e:
            print(f"Error: skipping {lang_code}, existing translations could not be loaded: {e}", flush=True)
            failed_languages.append(lang_code)
            continue

        # Translate the language
        if translate_language(lang_code, lang_name, keys_to_translate, en_data, existing_data, localizations_dir):
            translated_count += 1

    print(f"\n✓ Done - translated {translated_count} language(s)", flush=True)

    if failed_languages:
        # Keep signalling failure to the caller, as the unhandled exception did.
        print(f"Error: failed languages: {', '.join(failed_languages)}", flush=True)
        sys.exit(1)


if __name__ == "__main__":
    main()

.github/workflows/translate.yml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Regenerates the translated localization files with an LLM whenever the
# English source file changes on the default branch, and proposes the result
# as a pull request.
name: Translate

on:
  push:
    branches: ["main", "master"]
    paths:
      - "localizations/en_US.json"

jobs:
  translate:
    # Only run for pushes made by the repository owner.
    if: github.event.pusher.name == 'alexmercerind' && github.event.pusher.email == 'saini123hitesh@gmail.com'
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
      # Listing permissions explicitly sets every unlisted scope to "none";
      # GITHUB_TOKEN needs this scope to call the GitHub Models API.
      models: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history: the script diffs HEAD~1 against HEAD.
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install llm-github-models CLI
        run: |
          pip install llm-github-models
      - name: Configure llm
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          llm keys set github --value "$GITHUB_TOKEN"
      - name: Generate translations
        run: |
          python3 .github/scripts/translate.py
      - name: Check for changes
        id: check_changes
        run: |
          # `git diff --quiet` ignores untracked files, so a localization file
          # created for a new language would go unnoticed; `git status
          # --porcelain` reports modified and untracked files alike.
          if [ -z "$(git status --porcelain)" ]; then
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          fi
      - name: Create Pull Request
        if: steps.check_changes.outputs.has_changes == 'true'
        id: create_pr
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "chore: auto-translate localizations"
          title: "Auto-Translate Localizations"
          body: "This pull request contains automatically generated translations based on changes to `en_US.json`."
          branch: auto-translate-${{ github.run_number }}
          delete-branch: true
      - name: Assign reviewer to PR
        if: steps.check_changes.outputs.has_changes == 'true' && steps.create_pr.outputs.pull-request-number
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.pulls.requestReviewers({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: ${{ steps.create_pr.outputs.pull-request-number }},
              reviewers: ['gemini-code-assist']
            });

0 commit comments

Comments
 (0)