Skip to content

Commit 7e16428

Browse files
committed
modifications to preprocessing to avoid some hallucination issues
1 parent 8d9e997 commit 7e16428

File tree

2 files changed

+185
-59
lines changed

2 files changed

+185
-59
lines changed

scripts/translate/requirements.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ annotated-types==0.7.0
22
anyio==4.8.0
33
certifi==2024.12.14
44
distro==1.9.0
5-
h11==0.16.0
6-
httpcore==1.0.7
5+
h11
6+
httpcore
77
httpx==0.28.1
88
idna==3.10
99
jiter==0.8.2
@@ -16,3 +16,5 @@ tqdm==4.67.1
1616
typing_extensions==4.12.2
1717
xxhash==3.5.0
1818
llama_index==0.12.23
19+
python-frontmatter
20+
markdown-it-py

scripts/translate/translate.py

Lines changed: 181 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,30 @@
55
import xxhash
66
import argparse
77
import os
8+
import re
9+
import yaml
810

9-
from anthropic import Anthropic
1011
from llama_index.core import Document
1112
from llama_index.core.node_parser import MarkdownNodeParser
1213
import json
1314
import math
1415
import shutil
1516
from openai import OpenAI
16-
import anthropic
1717
from concurrent.futures import ThreadPoolExecutor, as_completed
18+
import frontmatter
19+
from markdown_it import MarkdownIt
1820

1921
TRANSLATE_EXCLUDED_FILES = {"about-us/adopters.md", "index.md", "integrations/language-clients/java/jdbc-v1.md"}
2022
TRANSLATE_EXCLUDED_FOLDERS = {"whats-new", "changelogs"}
2123
IGNORE_FOLDERS = {"ru", "zh"}
2224

23-
2425
client = OpenAI(
2526
api_key=os.environ.get("OPENAI_API_KEY"),
2627
)
2728
print(f"OpenAI API Key available: {'Yes' if os.environ.get('ANTHROPIC_API_KEY') else 'No'}")
2829

29-
anthropic_client = Anthropic(
30-
api_key=os.environ.get("ANTHROPIC_API_KEY"),
31-
)
32-
print(f"Anthropic API Key available: {'Yes' if os.environ.get('ANTHROPIC_API_KEY') else 'No'}")
33-
3430
MAX_CHUNK_SIZE = 30000
3531

36-
3732
def load_config(file_path):
3833
try:
3934
with open(file_path, "r", encoding="utf-8") as f:
@@ -50,7 +45,6 @@ def load_config(file_path):
5045
print(f"Config file not found at {file_path}. Exiting...")
5146
sys.exit(1)
5247

53-
5448
def format_glossary_prompt(glossary):
5549
glossary_text = "\n".join([f"- {key}: {value}" for key, value in glossary.items()])
5650
return f"Use the following glossary for specific translations of key technical terms. Take these into account even when translating YAML frontmatter fields like title, sidebar_label etc. Translate these words like this, within the context of the sentence:\n{glossary_text}\n"
@@ -92,60 +86,36 @@ def translate_text(config, text, model="gpt-4o-mini", translation_override_promp
9286
glossary = config["glossary"]
9387
prompt = config["prompt"] if "prompt" in config else f"""
9488
Translate the following ClickHouse documentation text from English to {language}. Ensure the following rules are followed:
95-
- This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Code blocks should be preserved using ` and ```.
96-
- Ensure that no content, links, explicit heading ids (denoted by {{#my-explicit-id}}), or references are omitted or altered during translation, preserving the same amount of information as the original text.
97-
- Do not translate code, URLs, or any links within markdown. Markdown links must be preserved and never modified. URLs in text should be surrounded by white space and never have adjacent {language} characters.
89+
- This content may be part of a document, so maintain the original HTML tags and markdown formatting used in Docusaurus, including any headings, lists, links, and inline formatting like bold or italic text.
90+
IMPORTANT:
91+
- Ensure that no content, links, explicit heading ids (denoted by {{#my-explicit-id}}), or references are omitted or altered during translation, preserving the semantic meaning of the text.
92+
- Never translate URLs of markdown links like "[some text](../../sql-reference/statements/create/dictionary.md)". You may translate the text inside the square brackets if appropriate. URLs in text should be surrounded by white space and never have adjacent {language} characters.
9893
- Ensure the markdown is MDX 3 compatible - escaping < and > with &lt; and &gt; and avoiding the creation of unclosed xml tags.
99-
- Do not add new code delimiters which are not present in the original content e.g. '```html', even if the content appears to contain this type.
100-
- Do not translate terms which indicate setting names. These are denoted by lower case and underscore e.g. live_view_heartbeat_interval.
101-
- Do not translate terms in all caps which are SQL statements. For example DESCRIBE TABLE, RENAME, SET ROLE etc.
102-
- Translate the title, sidebar_label, keywords (list of single quoted strings) and description in yaml metadata blocks if they exist. Ensure these are wrapped in single quotes. Do not add entries.
103-
- This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content.
94+
- Never translate terms which indicate setting names. These are denoted by lower case and underscore e.g. live_view_heartbeat_interval or max_os_cpu_wait_time_ratio_to_throw.
95+
- Never translate terms in all caps which are SQL statements. For example DESCRIBE TABLE, RENAME, SET ROLE etc.
96+
- This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content.
10497
- Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent {language}.
98+
99+
I suggest a two-step approach in which you first translate, and afterwards compare the original text to the translation
100+
and critically evaluate it and make modifications as appropriate.
101+
105102
"""
106103
glossary_prompt = format_glossary_prompt(glossary)
107-
prompt_content = f"{glossary_prompt}\n{prompt}\n{translation_override_prompt}"
104+
prompt_content = f"{prompt}\n{glossary_prompt}\n{translation_override_prompt}"
108105
try:
109-
if model=="claude-3-5-sonnet-20240620":
110-
with anthropic_client.messages.stream(
111-
max_tokens=8192, # max allowed for claude-3-5-sonnet-20240620
112-
messages=[
113-
{
114-
"role": "user",
115-
"content": [
116-
{
117-
"type": "text",
118-
"text": text
119-
}
120-
]
121-
}
122-
],
123-
model=model,
124-
system=prompt_content
125-
) as stream:
126-
full_response = ""
127-
128-
# Process each chunk as it arrives
129-
for chunk in stream:
130-
if chunk.type == "content_block_delta" and hasattr(chunk.delta, "text"):
131-
# Add this chunk of text to our response
132-
full_response += chunk.delta.text
133-
# Return the complete translated text
134-
return full_response
135-
else:
136-
completion = client.chat.completions.create(
137-
model=model,
138-
messages=[
139-
{"role": "system", "content": prompt_content},
140-
{"role": "user", "content": text}
141-
]
142-
)
143-
return completion.choices[0].message.content
106+
completion = client.chat.completions.create(
107+
model=model,
108+
messages=[
109+
{"role": "system", "content": prompt_content},
110+
{"role": "user", "content": text}
111+
]
112+
)
113+
return completion.choices[0].message.content
114+
144115
except Exception as e:
145116
print(f"failed to translate: {e}")
146117
return None
147118

148-
149119
def split_text(text, input_file_path, max_chunk_size=MAX_CHUNK_SIZE):
150120
if len(text) <= max_chunk_size:
151121
return [text]
@@ -171,16 +141,153 @@ def split_text(text, input_file_path, max_chunk_size=MAX_CHUNK_SIZE):
171141

172142
return chunks
173143

144+
class QuotedStringDumper(yaml.SafeDumper):
145+
def represent_str(self, data):
146+
return yaml.ScalarNode('tag:yaml.org,2002:str', data, style="'")
147+
148+
# Configure YAML dumper to use single quotes for strings
149+
QuotedStringDumper.add_representer(str, QuotedStringDumper.represent_str)
150+
def translate_frontmatter(frontmatter, glossary):
151+
# Translate just the fields we need
152+
153+
frontmatter_json = json.dumps(frontmatter)
154+
155+
system_prompt = f"""
156+
You are an expert translator of technical documentation.
157+
Translate the values for the following keys which are part of YAML frontmatter
158+
of a markdown document:
159+
- title
160+
- sidebar_label
161+
- description
162+
163+
Respond with a JSON object containing only the translated fields.
164+
165+
IMPORTANT: do not translate any SQL terms, or terms which are likely to be
166+
specific features or functions of ClickHouse.
167+
168+
You can use the following glossary for technical terms:
169+
170+
{glossary}
171+
"""
172+
completion = client.chat.completions.create(
173+
model="gpt-3.5-turbo-0125",
174+
messages=[
175+
{"role": "system", "content": system_prompt},
176+
{"role": "user", "content": frontmatter_json}
177+
],
178+
response_format={ "type": "json_object" }
179+
)
180+
181+
translated_content = json.loads(completion.choices[0].message.content)
182+
183+
for key in ["title", "sidebar_label", "description"]:
184+
if key in translated_content and key in frontmatter:
185+
frontmatter[key] = translated_content[key]
186+
187+
def extract_import_statements(text):
188+
# Regular expression to match import statements
189+
import_regex = r'^import\s+.+\s+from\s+[\'"].+[\'"];?$'
190+
191+
# Find all matches
192+
import_statements = re.findall(import_regex, text, re.MULTILINE)
193+
194+
return import_statements
195+
196+
def remove_import_statements(text):
197+
# Regular expression to match import statements
198+
import_regex = r'^import\s+.+\s+from\s+[\'"].+[\'"];?$'
199+
200+
# Remove all import statements
201+
cleaned_text = re.sub(import_regex, '', text, flags=re.MULTILINE)
202+
203+
# Clean up any resulting multiple newlines
204+
cleaned_text = re.sub(r'\n{2,}', '\n', cleaned_text)
205+
206+
return cleaned_text.strip()
207+
208+
def replace_code_blocks_with_custom_placeholders(markdown_text):
209+
lines = markdown_text.split('\n')
210+
result_lines = []
211+
code_blocks = []
212+
213+
in_code_block = False
214+
current_block = {
215+
'language': '',
216+
'content': []
217+
}
218+
219+
for line in lines:
220+
if line.strip().startswith('```') and not in_code_block:
221+
# Start of a code block
222+
in_code_block = True
223+
language_part = line.strip()[3:].strip() # Remove ``` and whitespace
224+
current_block = {
225+
'language': language_part,
226+
'content': []
227+
}
228+
elif line.strip() == '```' and in_code_block:
229+
# End of a code block
230+
in_code_block = False
231+
code_blocks.append({
232+
'language': current_block['language'],
233+
'content': '\n'.join(current_block['content'])
234+
})
235+
result_lines.append(f"<CODEBLOCK_{len(code_blocks)}>")
236+
elif in_code_block:
237+
# Inside a code block
238+
current_block['content'].append(line)
239+
else:
240+
# Outside a code block
241+
result_lines.append(line)
242+
return '\n'.join(result_lines), code_blocks
243+
244+
def restore_code_blocks(modified_text, code_blocks):
245+
246+
restored_text = modified_text
247+
248+
# Replace each placeholder with its corresponding code block
249+
for i, block in enumerate(code_blocks, 1):
250+
language = block['language']
251+
content = block['content']
252+
253+
# Create the code block with proper backticks and language
254+
if language:
255+
code_block = f"```{language}\n{content}\n```"
256+
else:
257+
code_block = f"```\n{content}\n```"
258+
259+
# Replace the placeholder
260+
placeholder = f"<CODEBLOCK_{i}>"
261+
restored_text = restored_text.replace(placeholder, code_block)
262+
263+
return restored_text
264+
174265
def translate_file(config, input_file_path, output_file_path, model):
175266
print(f"Starting translation: input[{input_file_path}], output[{output_file_path}]")
176267
start_time = time.time()
177268

178269
try:
179270
with open(input_file_path, "r", encoding="utf-8") as input_file:
180-
original_text = input_file.read()
271+
# Before splitting text into chunks, split the content and the frontmatter
272+
metadata, original_text = frontmatter.parse(input_file.read())
181273
print(f" - length: {len(original_text)}")
274+
275+
if input_file_path == "/Users/sstruw/Desktop/clickhouse-docs/docs/operations/settings/settings.md":
276+
pause_here = ""
277+
278+
# Translate the metadata
279+
translate_frontmatter(metadata, config["glossary"])
280+
281+
# Next extract all import statements from the text
282+
imports = extract_import_statements(original_text)
283+
cleaned_text = remove_import_statements(original_text)
284+
285+
# Extract codeblocks and replace them with numbered placeholders
286+
# that we will replace after translations are done.
287+
cleaned_text, code_blocks = replace_code_blocks_with_custom_placeholders(cleaned_text)
288+
182289
# Split text into chunks and translate
183-
num_chunk = math.ceil(len(original_text) / MAX_CHUNK_SIZE)
290+
num_chunk = math.ceil(len(cleaned_text) / MAX_CHUNK_SIZE)
184291
count = 1
185292
translated_text = ""
186293
chunks = split_text(original_text, input_file_path, MAX_CHUNK_SIZE)
@@ -218,6 +325,23 @@ def translate_file(config, input_file_path, output_file_path, model):
218325

219326
c=0
220327
bt = False
328+
329+
# Now we work backwards
330+
translated_text = restore_code_blocks(translated_text, code_blocks)
331+
332+
imports_text = "\n".join(imports)
333+
translated_text = imports_text + "\n\n" + translated_text
334+
335+
yaml_str = yaml.dump(
336+
metadata,
337+
Dumper=QuotedStringDumper,
338+
default_flow_style=False,
339+
sort_keys=False,
340+
allow_unicode=True
341+
)
342+
formatted_frontmatter = f"---\n{yaml_str}---"
343+
translated_text = formatted_frontmatter + "\n\n" + translated_text
344+
221345
with open(output_file_path, "w", encoding="utf-8") as output_file:
222346
lines = translated_text.splitlines()
223347
for line in lines:

0 commit comments

Comments
 (0)