5
5
import argparse
6
6
import json
7
7
import math
8
- from typing import Dict , Optional
8
+ import shutil
9
9
from openai import OpenAI
10
10
from concurrent .futures import ThreadPoolExecutor
11
- import re
12
- from pydantic import BaseModel , Field
13
11
14
-
15
- EXCLUDED_FILES = {"about-us/adopters.md" }
16
- EXCLUDED_FOLDERS = {"whats-new" , "changelogs" }
12
+ TRANSLATE_EXCLUDED_FILES = {"about-us/adopters.md" , "index.md" }
13
+ TRANSLATE_EXCLUDED_FOLDERS = {"whats-new" , "changelogs" }
17
14
18
15
client = OpenAI (
19
16
api_key = os .environ .get ("OPENAI_API_KEY" ),
@@ -48,7 +45,8 @@ def format_glossary_prompt(glossary):
48
45
def translate_text (config , text , model = "gpt-4o-mini" ):
49
46
language = config ["language" ]
50
47
glossary = config ["glossary" ]
51
- prompt = config ["prompt" ] if "prompt" in config else f"Translate the following ClickHouse documentation text from English to { language } . This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Ensure that no content, links, or references are omitted or altered during translation, preserving the same amount of information as the original text. Do not translate code, URLs, or any links within markdown. This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content. Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent { language } ."
48
+ prompt = config [
49
+ "prompt" ] if "prompt" in config else f"Translate the following ClickHouse documentation text from English to { language } . This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Ensure that no content, links, explicit heading ids (denoted by {{#my-explicit-id}}), or references are omitted or altered during translation, preserving the same amount of information as the original text. Do not translate code, URLs, or any links within markdown. This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content. Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent { language } ."
52
50
glossary_prompt = format_glossary_prompt (glossary )
53
51
prompt_content = f"{ glossary_prompt } \n { prompt } "
54
52
try :
@@ -82,24 +80,6 @@ def split_text(text, max_chunk_size):
82
80
return chunks
83
81
84
82
85
def process_page_new_language(content, lang_code, is_intro=False):
    """Rewrite English-specific paths in a docs page to point at ``lang_code``.

    Each known English marker (front-matter slugs, markdown links, ``@site``
    imports, quoted ``/docs/en/`` paths and absolute clickhouse.com URLs) is
    replaced with its ``lang_code`` counterpart, in a fixed order.

    Args:
        content: Full text of the page.
        lang_code: Target language code inserted in place of ``en``.
        is_intro: When true, a ``slug: /<lang_code>`` line is inserted right
            after the first line that consists solely of ``---``.

    Returns:
        The page text with the substitutions applied.
    """
    # Function-scope import keeps the block self-contained.
    import re

    # Every marker is a literal string, so plain str.replace is used instead
    # of re.sub: "." in "clickhouse.com" must not match arbitrary characters,
    # and lang_code must not be interpreted as a regex replacement template.
    replacements = (
        ("slug: /en/", f"slug: /{lang_code}/"),
        ("slug: '/en/", f"slug: '/{lang_code}/"),
        ('slug: "/en/', f'slug: "/{lang_code}/'),
        ("(/docs/en/", f"(/docs/{lang_code}/"),
        ("](/en/", f"](/{lang_code}/"),
        # NOTE: not idempotent - a second pass would insert lang_code again.
        ("@site/docs/", f"@site/docs/{lang_code}/"),
        ('"/docs/en/', f'"/docs/{lang_code}/'),
        ("clickhouse.com/docs/en", f"clickhouse.com/docs/{lang_code}"),
    )
    for marker, localized in replacements:
        content = content.replace(marker, localized)

    if is_intro:
        # A callable replacement inserts the text verbatim, with no
        # backslash or group-reference processing applied to lang_code.
        content = re.sub(
            r"^---$",
            lambda _match: f"---\nslug: /{lang_code}",
            content,
            count=1,
            flags=re.MULTILINE,
        )
    return content
101
-
102
-
103
83
def translate_file (config , input_file_path , output_file_path , model ):
104
84
print (f"start translation: input[{ input_file_path } ], output[{ output_file_path } ]" )
105
85
start_time = time .time ()
@@ -108,7 +88,6 @@ def translate_file(config, input_file_path, output_file_path, model):
108
88
with open (input_file_path , "r" , encoding = "utf-8" ) as input_file :
109
89
original_text = input_file .read ()
110
90
print (f" - length: { len (original_text )} " )
111
- original_text = process_page_new_language (original_text , config ["lang_code" ])
112
91
# Split text into chunks and translate
113
92
num_chunk = math .ceil (len (original_text ) / MAX_CHUNK_SIZE )
114
93
count = 1
@@ -118,19 +97,16 @@ def translate_file(config, input_file_path, output_file_path, model):
118
97
translated_chunk = translate_text (config , chunk , model )
119
98
if translated_chunk :
120
99
translated_text += translated_chunk + "\n "
121
- count += 1
100
+ count += 1
122
101
else :
123
102
print (f"failed to translate a chunk: [{ input_file_path } ]" )
124
103
return
125
104
126
105
with open (output_file_path , "w" , encoding = "utf-8" ) as output_file :
127
106
output_file .write (translated_text )
128
107
129
- # Rename input file with .translated suffix
130
- translated_file_name = f"{ os .path .basename (input_file_path )} .translated"
131
- translated_file_path = os .path .join (os .path .dirname (input_file_path ), translated_file_name )
132
-
133
- os .rename (input_file_path , translated_file_path )
108
+ # Rename output file with .translate suffix to .translated
109
+ os .rename (output_file_path , f"{ output_file_path } d" )
134
110
135
111
except FileNotFoundError :
136
112
print (f"no file: { input_file_path } " )
@@ -139,41 +115,48 @@ def translate_file(config, input_file_path, output_file_path, model):
139
115
140
116
end_time = time .time ()
141
117
duration = end_time - start_time
142
- print (f"finished translation: input[{ input_file_path } ], output[{ output_file_path } ], duration seconds[{ duration :.2f} ]" )
118
+ print (
119
+ f"finished translation: input[{ input_file_path } ], output[{ output_file_path } ], duration seconds[{ duration :.2f} ]" )
143
120
144
121
145
- def translate_folder (config , input_folder , output_folder , model = "gpt-4o-mini" ):
122
+ def translate_docs_folder (config , input_folder , output_folder , model = "gpt-4o-mini" ):
146
123
with ThreadPoolExecutor (max_workers = 5 ) as executor :
147
124
futures = []
148
125
for root , _ , files in os .walk (input_folder ):
149
126
relative_folder_path = os .path .relpath (root , input_folder )
150
- if any (excluded in relative_folder_path for excluded in EXCLUDED_FOLDERS ):
151
- print (f" - Skipping due to exclusion target: { relative_folder_path } " )
127
+ if any (excluded in relative_folder_path for excluded in TRANSLATE_EXCLUDED_FOLDERS ):
128
+ print (f"Skipping translation due to excluded folder target: { relative_folder_path } " )
129
+ shutil .copytree (os .path .join (input_folder , relative_folder_path ), os .path .join (output_folder , relative_folder_path ),dirs_exist_ok = True )
152
130
continue
153
131
154
132
for file in files :
155
133
input_file_path = os .path .join (root , file )
156
134
relative_path = os .path .relpath (input_file_path , input_folder )
157
- output_file_path = os .path .join (output_folder , relative_path + ".translated" )
158
135
if file .endswith ((".md" , ".mdx" )):
159
- os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
160
-
161
136
# Skip files that are in the excluded files set
162
- if relative_path in EXCLUDED_FILES :
163
- print (f" - Skipping due to exclusion target: { input_file_path } " )
137
+ if relative_path in TRANSLATE_EXCLUDED_FILES :
138
+ output_file_path = os .path .join (output_folder , relative_path )
139
+ print (f"Skipping translation due to exclusion target: { input_file_path } " )
140
+ shutil .copy (input_file_path , output_file_path )
164
141
continue
165
-
166
142
# Skip files that already have the translated suffix - allows continuing from failed point
167
- if file .endswith (".translated" ):
143
+ if os .path .exists (os .path .join (output_folder , relative_path + ".translated" )):
144
+ print (f"Skipping ${ input_file_path } translation due to already translated file" )
168
145
continue
169
-
146
+ # re-do files partially through translation
147
+ if os .path .exists (os .path .join (output_folder , relative_path + ".translate" )):
148
+ os .remove (os .path .join (output_folder , relative_path + ".translate" ))
149
+ output_file_path = os .path .join (output_folder , relative_path + ".translate" )
150
+ os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
170
151
# Submit the translation task to be run in parallel
171
152
futures .append (executor .submit (translate_file , config , input_file_path , output_file_path , model ))
172
153
else :
173
154
# symlink these files as we want to update in a single place
174
155
try :
156
+ output_file_path = os .path .join (output_folder , relative_path )
175
157
if os .path .exists (output_file_path ) or os .path .islink (output_file_path ):
176
158
os .remove (output_file_path ) # Remove existing file/link before creating symlink
159
+ os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
177
160
os .symlink (input_file_path , output_file_path )
178
161
print (f" - Created symlink: { output_file_path } -> { input_file_path } " )
179
162
except OSError as e :
@@ -183,6 +166,7 @@ def translate_folder(config, input_folder, output_folder, model="gpt-4o-mini"):
183
166
for future in futures :
184
167
future .result ()
185
168
169
+
186
170
def rename_translated_files (output_folder ):
187
171
for root , _ , files in os .walk (output_folder ):
188
172
for file in files :
@@ -201,7 +185,8 @@ def rename_translated_files(output_folder):
201
185
202
186
203
187
def translate_plugin_data (output_folder , config , model = "gpt-4o-mini" ):
204
- json_files = glob .glob (os .path .join (output_folder , "*.json" )) + glob .glob (os .path .join (output_folder , "*" , "*.json" ))
188
+ json_files = glob .glob (os .path .join (output_folder , "*.json" )) + glob .glob (
189
+ os .path .join (output_folder , "*" , "*.json" ))
205
190
language = config ["language" ]
206
191
glossary = config ["glossary" ]
207
192
prompt = f"""
@@ -223,7 +208,7 @@ def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
223
208
{"role" : "system" , "content" : prompt_content },
224
209
{"role" : "user" , "content" : json .dumps (text )},
225
210
],
226
- response_format = { "type" : "json_object" }
211
+ response_format = {"type" : "json_object" }
227
212
)
228
213
translated_text = completion .choices [0 ].message .content
229
214
translated_config = json .loads (translated_text )
@@ -234,6 +219,7 @@ def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
234
219
print (f"failed to translate: { e } " )
235
220
raise e
236
221
222
+
237
223
script_dir = os .path .dirname (os .path .abspath (__file__ ))
238
224
default_input_folder = os .path .abspath (os .path .join (script_dir , "../../docs/" ))
239
225
@@ -252,8 +238,9 @@ def main():
252
238
parser .add_argument ("--model" , default = "gpt-4o-mini" , help = "Specify the OpenAI model to use for translation" )
253
239
args = parser .parse_args ()
254
240
config = load_config (args .config )
255
- translate_plugin_data (args .output_folder , config , model = args .model )
256
- translate_folder (config , args .input_folder , args .output_folder , args .model )
241
+ # translate_plugin_data(args.output_folder, config, model=args.model)
242
+ translate_docs_folder (config , args .input_folder ,
243
+ os .path .join (args .output_folder + "/docusaurus-plugin-content-docs/current" ), args .model )
257
244
rename_translated_files (args .output_folder )
258
245
259
246
0 commit comments