Skip to content

Commit 313fd32

Browse files
authored
Merge branch 'main' into landing_pages_about
2 parents 766f753 + 9aa9b47 commit 313fd32

File tree

4 files changed

+42
-51
lines changed

4 files changed

+42
-51
lines changed

scripts/.markdownlint-cli2.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ ignores:
2020
# add or remove ignored directories here
2121
- "docs/ru"
2222
- "docs/zh"
23-
- "docs/en/whats-new"
24-
- "docs/en/_placeholders"
23+
- "docs/whats-new"
24+
- "docs/_placeholders"
2525
- "docs/operations/settings/settings.md" # autogenerated
2626
- "docs/operations/settings/settings-formats.md" # autogenerated
27+
- "docs/cloud/manage/api"
2728
customRules:
2829
# add custom rules here
2930
- "./markdownlint/rules/links_url_type.js"

scripts/autogenerate-table-of-contents.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/
2828
python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/engines/table-engines/integrations" --md="docs/engines/table-engines/integrations/index.md"
2929
python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/engines/table-engines/special" --md="docs/engines/table-engines/special/index.md"
3030
python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/getting-started/example-datasets" --md="docs/getting-started/index.md" --ignore images
31+
python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/sql-reference/aggregate-functions/reference" --md="docs/sql-reference/aggregate-functions/reference/index.md"
32+
python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/sql-reference/table-functions" --md="docs/sql-reference/table-functions/index.md"
3133
python3 scripts/table-of-contents-generator/toc_gen.py --single-toc --dir="docs/cloud/changelogs" --md="docs/cloud/reference/release-notes-index.md"
3234
deactivate
3335
rm -r venv

scripts/translate/translate.py

Lines changed: 35 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,12 @@
55
import argparse
66
import json
77
import math
8-
from typing import Dict, Optional
8+
import shutil
99
from openai import OpenAI
1010
from concurrent.futures import ThreadPoolExecutor
11-
import re
12-
from pydantic import BaseModel, Field
1311

14-
15-
EXCLUDED_FILES = {"about-us/adopters.md"}
16-
EXCLUDED_FOLDERS = {"whats-new", "changelogs"}
12+
TRANSLATE_EXCLUDED_FILES = {"about-us/adopters.md", "index.md"}
13+
TRANSLATE_EXCLUDED_FOLDERS = {"whats-new", "changelogs"}
1714

1815
client = OpenAI(
1916
api_key=os.environ.get("OPENAI_API_KEY"),
@@ -48,7 +45,8 @@ def format_glossary_prompt(glossary):
4845
def translate_text(config, text, model="gpt-4o-mini"):
4946
language = config["language"]
5047
glossary = config["glossary"]
51-
prompt = config["prompt"] if "prompt" in config else f"Translate the following ClickHouse documentation text from English to {language}. This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Ensure that no content, links, or references are omitted or altered during translation, preserving the same amount of information as the original text. Do not translate code, URLs, or any links within markdown. This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content. Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent {language}."
48+
prompt = config[
49+
"prompt"] if "prompt" in config else f"Translate the following ClickHouse documentation text from English to {language}. This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Ensure that no content, links, explicit heading ids (denoted by {{#my-explicit-id}}), or references are omitted or altered during translation, preserving the same amount of information as the original text. Do not translate code, URLs, or any links within markdown. This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content. Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent {language}."
5250
glossary_prompt = format_glossary_prompt(glossary)
5351
prompt_content = f"{glossary_prompt}\n{prompt}"
5452
try:
@@ -82,24 +80,6 @@ def split_text(text, max_chunk_size):
8280
return chunks
8381

8482

85-
def process_page_new_language(content, lang_code, is_intro=False):
86-
replacements = [
87-
(r"slug: /en/", f"slug: /{lang_code}/"),
88-
(r"slug: '/en/", f"slug: '/{lang_code}/"),
89-
(r'slug: "/en/', f'slug: "/{lang_code}/'),
90-
(r"\(/docs/en/", f"(/docs/{lang_code}/"),
91-
(r"\]\(/en/", f"](/{lang_code}/"),
92-
(r"@site/docs/", f"@site/docs/{lang_code}/"),
93-
(r'"/docs/en/', f'"/docs/{lang_code}/'),
94-
(r"clickhouse.com/docs/en", f"clickhouse.com/docs/{lang_code}"),
95-
]
96-
for pattern, replacement in replacements:
97-
content = re.sub(pattern, replacement, content)
98-
if is_intro:
99-
content = re.sub(r"^---$", f"---\nslug: /{lang_code}", content, count=1, flags=re.MULTILINE)
100-
return content
101-
102-
10383
def translate_file(config, input_file_path, output_file_path, model):
10484
print(f"start translation: input[{input_file_path}], output[{output_file_path}]")
10585
start_time = time.time()
@@ -108,7 +88,6 @@ def translate_file(config, input_file_path, output_file_path, model):
10888
with open(input_file_path, "r", encoding="utf-8") as input_file:
10989
original_text = input_file.read()
11090
print(f" - length: {len(original_text)}")
111-
original_text = process_page_new_language(original_text, config["lang_code"])
11291
# Split text into chunks and translate
11392
num_chunk = math.ceil(len(original_text) / MAX_CHUNK_SIZE)
11493
count = 1
@@ -118,19 +97,16 @@ def translate_file(config, input_file_path, output_file_path, model):
11897
translated_chunk = translate_text(config, chunk, model)
11998
if translated_chunk:
12099
translated_text += translated_chunk + "\n"
121-
count+=1
100+
count += 1
122101
else:
123102
print(f"failed to translate a chunk: [{input_file_path}]")
124103
return
125104

126105
with open(output_file_path, "w", encoding="utf-8") as output_file:
127106
output_file.write(translated_text)
128107

129-
# Rename input file with .translated suffix
130-
translated_file_name = f"{os.path.basename(input_file_path)}.translated"
131-
translated_file_path = os.path.join(os.path.dirname(input_file_path), translated_file_name)
132-
133-
os.rename(input_file_path, translated_file_path)
108+
# Rename output file with .translate suffix to .translated
109+
os.rename(output_file_path, f"{output_file_path}d")
134110

135111
except FileNotFoundError:
136112
print(f"no file: {input_file_path}")
@@ -139,41 +115,48 @@ def translate_file(config, input_file_path, output_file_path, model):
139115

140116
end_time = time.time()
141117
duration = end_time - start_time
142-
print(f"finished translation: input[{input_file_path}], output[{output_file_path}], duration seconds[{duration:.2f}]")
118+
print(
119+
f"finished translation: input[{input_file_path}], output[{output_file_path}], duration seconds[{duration:.2f}]")
143120

144121

145-
def translate_folder(config, input_folder, output_folder, model="gpt-4o-mini"):
122+
def translate_docs_folder(config, input_folder, output_folder, model="gpt-4o-mini"):
146123
with ThreadPoolExecutor(max_workers=5) as executor:
147124
futures = []
148125
for root, _, files in os.walk(input_folder):
149126
relative_folder_path = os.path.relpath(root, input_folder)
150-
if any(excluded in relative_folder_path for excluded in EXCLUDED_FOLDERS):
151-
print(f" - Skipping due to exclusion target: {relative_folder_path}")
127+
if any(excluded in relative_folder_path for excluded in TRANSLATE_EXCLUDED_FOLDERS):
128+
print(f"Skipping translation due to excluded folder target: {relative_folder_path}")
129+
shutil.copytree(os.path.join(input_folder, relative_folder_path), os.path.join(output_folder, relative_folder_path),dirs_exist_ok=True)
152130
continue
153131

154132
for file in files:
155133
input_file_path = os.path.join(root, file)
156134
relative_path = os.path.relpath(input_file_path, input_folder)
157-
output_file_path = os.path.join(output_folder, relative_path + ".translated")
158135
if file.endswith((".md", ".mdx")):
159-
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
160-
161136
# Skip files that are in the excluded files set
162-
if relative_path in EXCLUDED_FILES:
163-
print(f" - Skipping due to exclusion target: {input_file_path}")
137+
if relative_path in TRANSLATE_EXCLUDED_FILES:
138+
output_file_path = os.path.join(output_folder, relative_path)
139+
print(f"Skipping translation due to exclusion target: {input_file_path}")
140+
shutil.copy(input_file_path, output_file_path)
164141
continue
165-
166142
# Skip files that already have the translated suffix - allows continuing from failed point
167-
if file.endswith(".translated"):
143+
if os.path.exists(os.path.join(output_folder, relative_path + ".translated")):
144+
print(f"Skipping ${input_file_path} translation due to already translated file")
168145
continue
169-
146+
# re-do files partially through translation
147+
if os.path.exists(os.path.join(output_folder, relative_path + ".translate")):
148+
os.remove(os.path.join(output_folder, relative_path + ".translate"))
149+
output_file_path = os.path.join(output_folder, relative_path + ".translate")
150+
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
170151
# Submit the translation task to be run in parallel
171152
futures.append(executor.submit(translate_file, config, input_file_path, output_file_path, model))
172153
else:
173154
# symlink these files as we want to update in a single place
174155
try:
156+
output_file_path = os.path.join(output_folder, relative_path)
175157
if os.path.exists(output_file_path) or os.path.islink(output_file_path):
176158
os.remove(output_file_path) # Remove existing file/link before creating symlink
159+
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
177160
os.symlink(input_file_path, output_file_path)
178161
print(f" - Created symlink: {output_file_path} -> {input_file_path}")
179162
except OSError as e:
@@ -183,6 +166,7 @@ def translate_folder(config, input_folder, output_folder, model="gpt-4o-mini"):
183166
for future in futures:
184167
future.result()
185168

169+
186170
def rename_translated_files(output_folder):
187171
for root, _, files in os.walk(output_folder):
188172
for file in files:
@@ -201,7 +185,8 @@ def rename_translated_files(output_folder):
201185

202186

203187
def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
204-
json_files = glob.glob(os.path.join(output_folder, "*.json")) + glob.glob(os.path.join(output_folder, "*", "*.json"))
188+
json_files = glob.glob(os.path.join(output_folder, "*.json")) + glob.glob(
189+
os.path.join(output_folder, "*", "*.json"))
205190
language = config["language"]
206191
glossary = config["glossary"]
207192
prompt = f"""
@@ -223,7 +208,7 @@ def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
223208
{"role": "system", "content": prompt_content},
224209
{"role": "user", "content": json.dumps(text)},
225210
],
226-
response_format={ "type": "json_object" }
211+
response_format={"type": "json_object"}
227212
)
228213
translated_text = completion.choices[0].message.content
229214
translated_config = json.loads(translated_text)
@@ -234,6 +219,7 @@ def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
234219
print(f"failed to translate: {e}")
235220
raise e
236221

222+
237223
script_dir = os.path.dirname(os.path.abspath(__file__))
238224
default_input_folder = os.path.abspath(os.path.join(script_dir, "../../docs/"))
239225

@@ -252,8 +238,9 @@ def main():
252238
parser.add_argument("--model", default="gpt-4o-mini", help="Specify the OpenAI model to use for translation")
253239
args = parser.parse_args()
254240
config = load_config(args.config)
255-
translate_plugin_data(args.output_folder, config, model=args.model)
256-
translate_folder(config, args.input_folder, args.output_folder, args.model)
241+
# translate_plugin_data(args.output_folder, config, model=args.model)
242+
translate_docs_folder(config, args.input_folder,
243+
os.path.join(args.output_folder + "/docusaurus-plugin-content-docs/current"), args.model)
257244
rename_translated_files(args.output_folder)
258245

259246

sidebars.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,7 @@ const sidebars = {
496496
label: "Engines",
497497
collapsed: false,
498498
collapsible: false,
499+
link: {type: "doc", id: "engines/index"},
499500
items: [
500501
{
501502
type: "autogenerated",
@@ -1755,7 +1756,7 @@ const sidebars = {
17551756
type: "link",
17561757
label: "Engines",
17571758
description: "Use the right table and database engines for your data",
1758-
href: "/engines/database-engines"
1759+
href: "/engines"
17591760
},
17601761
{
17611762
type: "link",

0 commit comments

Comments
 (0)