Skip to content

Commit dbe9819

Browse files
authored
Improve translation consistency with temperature=0 and diff-aware updates (#860)
* fix(translate): set temperature=0 for deterministic translations

  Reduces non-deterministic variation in translations by setting temperature=0 for Claude API calls. This produces more consistent, literal translations, which is appropriate for technical documentation.

* feat(translate): add diff-aware incremental translation updates

  When English source content changes (not prompt changes), the system now:

  1. Computes the git diff of what changed in the English file
  2. Includes the diff in the prompt with explicit instructions to:
     - Update ONLY sections corresponding to the changes
     - Preserve all other content exactly as-is
     - Allow fixing major issues (truncation, missing sections) if found

  This reduces unnecessary rephrasing of translations that are already correct, resulting in smaller, more focused diffs in translation PRs.

  The diff-aware mode is skipped when:

  - Prompt files changed (full re-translation needed)
  - Diff is >20% of file lines (too large for incremental approach)
  - No existing translation exists (new file)

  Also adds 'prompt_changed' field to TranslationFile to distinguish between prompt-triggered and content-triggered updates.
1 parent 8bc38de commit dbe9819

File tree

1 file changed

+107
-12
lines changed

1 file changed

+107
-12
lines changed

_scripts/translate.py

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ class TranslationFile:
101101
en_path: Path
102102
lang_path: Path
103103
language: str
104+
prompt_changed: bool = False # True if update triggered by prompt change
104105

105106
@property
106107
def exists(self) -> bool:
@@ -229,18 +230,51 @@ def get_lang_prompt(lang: str) -> str:
229230

230231

231232
def build_translation_prompt(
232-
lang: str, lang_name: str, en_content: str, existing: str | None = None
233+
lang: str,
234+
lang_name: str,
235+
en_content: str,
236+
existing: str | None = None,
237+
en_diff: str | None = None,
233238
) -> str:
234-
"""Build the full prompt for translation."""
239+
"""Build the full prompt for translation.
240+
241+
Args:
242+
lang: Language code (e.g., 'pt', 'es')
243+
lang_name: Human-readable language name (e.g., 'português')
244+
en_content: Full English source content
245+
existing: Existing translation content, if updating
246+
en_diff: Git diff of English changes, for incremental updates
247+
"""
235248
parts = [get_general_prompt(), get_lang_prompt(lang)]
236249

237250
if existing:
238-
parts.append(
239-
"## Existing Translation\n"
240-
"Update minimally: add new content, remove deleted content, "
241-
"fix guideline violations, preserve correct lines exactly.\n\n"
242-
f"Previous translation:\n%%%\n{existing}%%%"
243-
)
251+
if en_diff:
252+
# Diff-aware incremental update mode
253+
parts.append(
254+
"## Incremental Update Mode\n\n"
255+
"The English source has been updated. "
256+
"The following diff shows exactly what changed:\n\n"
257+
f"```diff\n{en_diff}\n```\n\n"
258+
"**Instructions:**\n\n"
259+
"1. Locate the corresponding section(s) in your existing translation\n"
260+
"2. Update ONLY those specific sections to reflect the English changes\n"
261+
"3. Preserve ALL other content exactly as-is, character for character\n"
262+
"4. Do NOT rephrase, improve, or modify any sections unrelated to the diff\n\n"
263+
"**Exception:** If you find major issues beyond the diff (e.g., truncated "
264+
"content, missing sections, or significant errors), you may fix those as well. "
265+
"The goal is to avoid unnecessary rephrasing of translations that are already "
266+
"correct.\n\n"
267+
f"## Existing Translation\n\n%%%\n{existing}%%%"
268+
)
269+
else:
270+
# Full re-translation with existing as reference
271+
# (prompt changed, diff too large, or diff unavailable)
272+
parts.append(
273+
"## Existing Translation\n"
274+
"Update minimally: add new content, remove deleted content, "
275+
"fix guideline violations, preserve correct lines exactly.\n\n"
276+
f"Previous translation:\n%%%\n{existing}%%%"
277+
)
244278

245279
parts.append(
246280
f"## Task\nTranslate to {lang} ({lang_name}).\n\n"
@@ -282,6 +316,45 @@ def file_changed_since(path: Path, since_commit: str) -> bool:
282316
return False
283317

284318

319+
def get_file_diff(
    path: Path,
    since_commit: str,
    *,
    max_change_ratio: float = 0.2,
) -> str | None:
    """Get the git diff for a file since a specific commit.

    Callers treat a ``None`` result as "no usable diff" and fall back to a
    full (non-incremental) translation, so every failure mode here degrades
    to ``None`` instead of raising.

    Args:
        path: File path to get diff for (expected to live under REPO_ROOT)
        since_commit: Commit SHA to compare against
        max_change_ratio: Largest allowed ratio of changed diff lines to
            current file lines; defaults to 0.2 (20%)

    Returns:
        The unified diff string, or None if:
        - No changes detected
        - Diff is too large (> max_change_ratio of current file lines)
        - Git command fails
        - The path is outside the repository or the file cannot be read
    """
    try:
        repo = _get_repo()
        rel_path = path.relative_to(REPO_ROOT)

        # Compare the baseline commit against HEAD for this one file only.
        diff = repo.git.diff(f"{since_commit}..HEAD", "--", str(rel_path))
        if not diff:
            return None

        # Same line-count convention as the rest of the file: newlines + 1,
        # which is always >= 1, so no zero guard is needed below.
        file_lines = path.read_text(encoding="utf-8").count("\n") + 1
    except (git.GitCommandError, OSError, ValueError):
        # GitCommandError: git itself failed (e.g. unknown commit).
        # ValueError: path not under REPO_ROOT, or undecodable file bytes.
        # OSError: file missing or unreadable.
        return None

    # Too many changes: an incremental update is not worthwhile, so signal
    # the caller to fall back to full translation mode.
    if _count_diff_changes(diff) > file_lines * max_change_ratio:
        return None

    return diff


def _count_diff_changes(diff: str) -> int:
    """Count added and removed lines inside the hunks of a unified diff.

    File headers (``diff --git``, ``index``, ``---``, ``+++``) always come
    before the first ``@@`` hunk marker, so they are skipped by position
    rather than by prefix. Matching on the ``+++``/``---`` prefix would also
    discard real changes whose content starts with ``++`` or ``--`` (for
    example a removed Markdown rule ``---`` is rendered as ``----``).
    """
    changes = 0
    in_hunk = False
    for line in diff.split("\n"):
        if line.startswith("@@"):
            in_hunk = True
        elif line.startswith("diff --git"):
            # Start of a new file section: header lines follow until next @@.
            in_hunk = False
        elif in_hunk and line.startswith(("+", "-")):
            changes += 1
    return changes
356+
357+
285358
def get_translation_baseline(
286359
github_token: str | None = None,
287360
github_repository: str | None = None,
@@ -396,10 +469,14 @@ def get_outdated_files(lang: str, baseline: str | None = None) -> list[Translati
396469

397470
if prompts_changed:
398471
# Prompt changed: all existing translations need re-translation
399-
outdated.append(TranslationFile(en_path, lang_path, lang))
472+
outdated.append(
473+
TranslationFile(en_path, lang_path, lang, prompt_changed=True)
474+
)
400475
elif file_changed_since(en_path, baseline):
401476
# Check if English file changed since baseline
402-
outdated.append(TranslationFile(en_path, lang_path, lang))
477+
outdated.append(
478+
TranslationFile(en_path, lang_path, lang, prompt_changed=False)
479+
)
403480

404481
return outdated
405482

@@ -453,6 +530,7 @@ def _call_claude_once(
453530
max_tokens=MAX_TOKENS,
454531
timeout=REQUEST_TIMEOUT,
455532
messages=messages,
533+
temperature=0,
456534
)
457535
except (
458536
anthropic.APIConnectionError,
@@ -538,6 +616,7 @@ async def _call_claude_once_async(
538616
max_tokens=MAX_TOKENS,
539617
timeout=REQUEST_TIMEOUT,
540618
messages=messages,
619+
temperature=0,
541620
)
542621
except (
543622
anthropic.APIConnectionError,
@@ -640,8 +719,17 @@ def translate_file(tf: TranslationFile, console: Console) -> None:
640719
action = "[yellow]Updating[/yellow]" if existing else "[green]Translating[/green]"
641720
console.print(f" {action} [magenta]{tf.relative_path}[/magenta]")
642721

722+
# Compute diff for incremental updates (not prompt-triggered, existing translation)
723+
en_diff = None
724+
if existing and not tf.prompt_changed:
725+
baseline = get_translation_baseline(tf.language)
726+
if baseline:
727+
en_diff = get_file_diff(tf.en_path, baseline)
728+
if en_diff:
729+
console.print(" [dim]Using incremental update mode[/dim]")
730+
643731
prompt = build_translation_prompt(
644-
tf.language, langs[tf.language], en_content, existing
732+
tf.language, langs[tf.language], en_content, existing, en_diff
645733
)
646734
result = call_claude(prompt, console)
647735

@@ -821,8 +909,15 @@ async def translate_file_async(
821909
entry.input_lines = en_content.count("\n") + 1
822910
entry.input_hash = hashlib.md5(en_content.encode()).hexdigest()[:12]
823911

912+
# Compute diff for incremental updates (not prompt-triggered, existing translation)
913+
en_diff = None
914+
if existing and not tf.prompt_changed:
915+
baseline = get_translation_baseline(tf.language)
916+
if baseline:
917+
en_diff = get_file_diff(tf.en_path, baseline)
918+
824919
prompt = build_translation_prompt(
825-
tf.language, langs[tf.language], en_content, existing
920+
tf.language, langs[tf.language], en_content, existing, en_diff
826921
)
827922
result = await call_claude_async(prompt, filename, client)
828923
output_content = f"{result.text}\n"

0 commit comments

Comments
 (0)