-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtranslate.ts
More file actions
459 lines (377 loc) · 14.5 KB
/
translate.ts
File metadata and controls
459 lines (377 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
import crypto from 'node:crypto'
import fs from 'node:fs/promises'
import path from 'node:path'
import { setTimeout } from 'node:timers/promises'
import { isDeepStrictEqual } from 'node:util'
import { removeLeadingSlash } from '@rspress/shared'
import { logger } from '@rspress/shared/logger'
import { Command } from 'commander'
import { render } from 'ejs'
import { merge } from 'es-toolkit/compat'
import matter from 'gray-matter'
import { AzureOpenAI, RateLimitError } from 'openai'
import { pRateLimit } from 'p-ratelimit'
import { glob } from 'tinyglobby'
import { cyan, red } from 'yoctocolors'
import {
mdProcessor,
mdxProcessor,
normalizeImgSrc,
type NormalizeImgSrcOptions,
} from '../plugins/index.js'
import { Language, SUPPORTED_LANGUAGES } from '../shared/index.js'
import type { GlobalCliOptions, TranslateOptions } from '../types.js'
import { pathExists } from '../utils/index.js'
import {
escapeMarkdownHeadingIds,
getMatchedDocFilePaths,
parseBoolean,
parseTerms,
} from './helpers.js'
import { loadConfig } from './load-config.js'
/**
 * Frontmatter fields of a document that the translate command reads or writes.
 */
export interface I18nFrontmatter {
/** Per-document translation settings. */
i18n?: {
/** Titles keyed by language; the entry for the target language replaces `title` in the translated file. */
title?: Record<string, string>
/** Extra prompt text for this document, passed to `translate` as `additionalPrompts`. */
additionalPrompts?: string
/** When truthy in the source or in an existing target file, the translate command skips the document. */
disableAutoTranslation?: boolean
}
/** SHA-256 hex digest of the source file content, written to the target file and compared on later runs to skip unchanged sources. */
sourceSHA?: string
/** Document title. */
title?: string
}
/**
 * Languages covered by the terms glossary: `translate` only adds the glossary
 * to the prompt when both the source and the target language are in this list.
 */
export const TERMS_SUPPORTED_LANGUAGES: Language[] = ['en', 'zh']
/**
 * Default system prompt used by `translate` when no `systemPrompt` is supplied.
 *
 * This is an EJS template; `translate` renders it with `sourceLang`,
 * `targetLang`, `terms`, `userPrompt` and `additionalPrompts`. The text is
 * sent to the model verbatim, so every character here is runtime behavior.
 */
const DEFAULT_SYSTEM_PROMPT = `
## 角色
你是一位专业的技术文档工程师,擅长写作高质量的<%= targetLang %>技术文档。请你帮我准确地将以下<%= sourceLang %>翻译成<%= targetLang %>,风格与<%= targetLang %>技术文档保持一致。
## 规则
- 第一条消息为需要翻译的最新<%= sourceLang %>文档,第二条消息为之前翻译过的但内容可能过期的<%= targetLang %>文档,如果没有翻译过则为空
- 输入格式为 MDX 格式,输出格式也必须保留原始 MDX 格式,不要翻译其中的 jsx 组件名称,如 <Overview />,且不要额外包装在不必要的代码块中
- 文档中的资源链接不要翻译和替换
- MDX 组件中包含的内容需要翻译,MDX 组件名和参数值不需要翻译,但特殊的 MDX 组件参数值需要翻译,示例:
- <Overview /> 中的 Overview 是组件名,不用翻译
- <Tab label="value">组件包含的内容</Tab>,label 是 key 不用翻译,"value" 是参数值需要翻译
<%= terms %>
- 如果存在下列注释,请保留不用翻译,更不要修改注释内容
- {/* release-notes-for-bugs */}
- <!-- release-notes-for-bugs -->
- 如果存在下列注释,请整体移除不要保留
- {/* reference-start */}
- {/* reference-end */}
- <!-- reference-start -->
- <!-- reference-end -->
- 翻译过程中务必保留原文中的 \\< 和 \\{ 转义字符不要做任何转义变更
- 翻译过程中不要破坏原有的 Markdown 格式,如 frontmatter, 代码块、列表、表格等,其中 frontmatter.i18n 的内容不用做任何翻译,只需要原样返回即可
## 策略
分四步进行翻译工作:
1. 根据<%= sourceLang %>文档直译成<%= targetLang %>,保持原有格式,不要遗漏任何信息
2. 根据第一步直译的结果,指出其中存在的具体问题,要准确描述,不宜笼统的表示,也不要增加原文不存在的内容或格式,包括不仅限于
- 不符合<%= targetLang %>表达习惯,明确指出不符合的地方
- 语句不通顺,指出位置,不需要给出修改意见,意译时修复
- 晦涩难懂,模棱两可,不易理解,可以尝试给出解释
3. 根据第一步直译的结果和第二步指出的问题,重新进行意译,保证内容的原意的基础上,使其更易于理解,更符合<%= targetLang %>技术文档的表达习惯,同时保持原有的格式不变
4. 当存在之前翻译过的<%= targetLang %>文档时,将第三步的结果分段与之前的<%= targetLang %>文档细致地比较,不要遗漏任何新的分段(包括文本、代码块、图片、超链接等等),如果分段内翻译结果意思相近,仅仅表达方式不同的,且没有新增任何内容时,则该分段只需要保持之前翻译过的内容即可,不需要重复翻译
最终只需要完整输出最后一步的结果,不需要输出任何提及提示词或之前步骤的内容,也不要只返回新增的内容。
<%= userPrompt %>
<%= additionalPrompts %>
`.trim()
// Azure OpenAI client; stays undefined until the first `translate` call creates it.
let openai: AzureOpenAI | undefined
/**
 * Arguments of `translate`: the shared `TranslateOptions` plus the
 * per-document values.
 */
export interface InternalTranslateOptions extends TranslateOptions {
/** Language of `sourceContent`. */
source: Language
/** Document text to translate; sent as the first user message. */
sourceContent: string
/** Language to translate into. */
target: Language
/** Existing translation, if any; sent as the second user message (empty string when omitted). */
targetContent?: string
/** Extra prompt text made available to the system prompt template as `additionalPrompts`. */
additionalPrompts?: string
}
/**
 * Loads the glossary via `parseTerms` and formats it as a prompt bullet: a
 * heading line followed by one `en <-> zh` row per term. A term without a
 * `zh` value falls back to its `en` value.
 */
const resolveTerms_ = async () => {
  const glossary = await parseTerms()
  const rows = glossary.map(({ en, zh = en }) => ` * ${en} <-> ${zh}`)
  const heading = '- 以下是常见的相关术语词汇对应表(English <-> 中文)\n'
  return heading + rows.join('\n')
}
// Promise of the formatted glossary, created on the first `resolveTerms` call
// and handed back to every later caller.
let termsCache: Promise<string> | undefined

/**
 * Returns the formatted glossary, building it at most once: the first call
 * starts `resolveTerms_` and stores its promise, later calls reuse it.
 */
const resolveTerms = async () => {
  termsCache ??= resolveTerms_().then((resolved) => {
    logger.debug('Resolved terms:', resolved)
    return resolved
  })
  return termsCache
}
/**
 * Translates one document with the Azure OpenAI chat completions API.
 *
 * The system prompt is `systemPrompt` (or `DEFAULT_SYSTEM_PROMPT` when it is
 * empty) rendered as an EJS template with `sourceLang`, `targetLang`,
 * `userPrompt`, `additionalPrompts` and `terms`. The source document and the
 * previous translation are sent as two consecutive user messages.
 *
 * @param options - Languages, document contents and prompt overrides.
 * @returns The message content of the first completion choice.
 * @throws Error when the model refuses, or when the response carries no
 * choice or no content. Errors raised by the OpenAI client (for example
 * `RateLimitError`) propagate unchanged.
 */
export const translate = async ({
  source,
  sourceContent,
  target,
  targetContent = '',
  systemPrompt,
  userPrompt = '',
  additionalPrompts = '',
}: InternalTranslateOptions) => {
  // Create the client on first use and keep it for subsequent calls.
  if (!openai) {
    openai = new AzureOpenAI({
      endpoint:
        process.env.AZURE_OPENAI_ENDPOINT ||
        'https://azure-ai-api-gateway.alauda.cn',
      apiKey: process.env.AZURE_OPENAI_API_KEY,
      apiVersion: process.env.OPENAI_API_VERSION || '2025-03-01-preview',
    })
  }

  const sourceLang = Language[source]
  const targetLang = Language[target]

  // The glossary is only added when both languages are covered by it.
  let terms = ''
  if (
    [source, target].every((lang) => TERMS_SUPPORTED_LANGUAGES.includes(lang))
  ) {
    terms = await resolveTerms()
  }

  const finalSystemPrompt = await render(
    systemPrompt?.trim() || DEFAULT_SYSTEM_PROMPT,
    { sourceLang, targetLang, userPrompt, additionalPrompts, terms },
    { async: true },
  )

  const { choices } = await openai.chat.completions.parse({
    messages: [
      {
        role: 'system',
        content: finalSystemPrompt,
      },
      {
        role: 'user',
        content: sourceContent,
      },
      {
        role: 'user',
        content: targetContent,
      },
    ],
    model: 'gpt-4o-mini',
    temperature: 0.2,
  })

  // Fail with a descriptive message instead of letting an empty response
  // surface later as an unrelated error in the caller.
  const message = choices[0]?.message
  if (!message) {
    throw new Error('The translation request returned no choices.')
  }

  const { content, refusal } = message
  if (refusal) {
    throw new Error(refusal)
  }
  if (content == null) {
    throw new Error('The translation request returned no content.')
  }

  return content
}
// Limiter that every per-file translation in the translate command runs through.
// NOTE(review): going by p-ratelimit's option names this presumably allows at
// most 50 calls per 60s interval with up to 10 running at once — confirm
// against the p-ratelimit documentation.
const limit = pRateLimit({
interval: 60_000, // 1min
rate: 50,
concurrency: 10,
})
/** Options parsed by the `translate` CLI command. */
export interface TranslateCommandOptions {
/** Source language (`-s, --source`, default `en`); also the name of the source directory under the docs root. */
source: Language
/** Target language (`-t, --target`, default `zh`); also the name of the target directory under the docs root. */
target: Language
/** Glob patterns (`-g, --glob`) matched inside the source directory. */
glob: string[]
/** Value of `-C, --copy`; forwarded to `normalizeImgSrc` as `translating.copy`. */
copy?: boolean
}
// Comma-separated list of supported languages, shown in the --source/--target help text.
const supportedLanguages = SUPPORTED_LANGUAGES.join(', ')
/**
 * `translate` CLI command.
 *
 * Resolves the documents matched by `--glob` inside `<docs root>/<source>`,
 * translates each of them with `translate`, and writes the result to the
 * same relative path inside `<docs root>/<target>`.
 *
 * - Files matched by `config.internalRoutes` are never translated.
 * - A document is skipped when its source or target frontmatter sets
 *   `i18n.disableAutoTranslation`, or when the target's `sourceSHA` equals the
 *   SHA-256 of the current source and `force` is not set.
 * - When the glob is exactly `*`, target files without a matching source
 *   file are removed first.
 * - Rate-limit errors are retried up to 15 times; other errors are rethrown.
 *
 * Failures are reported through `logger.error` and `process.exitCode = 1`.
 */
export const translateCommand = new Command('translate')
  .description('Translate the documentation')
  .argument('[root]', 'Root directory of the documentation')
  .option(
    '-s, --source <language>',
    `Document source language, one of ${supportedLanguages}`,
    'en',
  )
  .option(
    '-t, --target <language>',
    `Document target language, one of ${supportedLanguages}`,
    'zh',
  )
  .requiredOption('-g, --glob <path...>', 'Glob patterns for source dirs/files')
  .option(
    '-C, --copy [boolean]',
    'Whether to copy relative assets to the target directory instead of following links',
    parseBoolean,
    false,
  )
  .action(async function (root?: string) {
    const {
      source,
      target,
      glob: globs,
      copy,
      force,
      ...globalOptions
    } = this.optsWithGlobals<TranslateCommandOptions & GlobalCliOptions>()

    // Both languages must be known and they must differ.
    if (
      !Object.hasOwn(Language, source) ||
      !Object.hasOwn(Language, target) ||
      source === target
    ) {
      logger.error(
        `Translate from language \`${cyan(source)}\` to \`${cyan(target)}\` is not supported.`,
      )
      process.exitCode = 1
      return
    }

    const { config } = await loadConfig(root, globalOptions)

    const docsDir = config.root!
    const sourceDir = path.resolve(docsDir, source)
    const targetDir = path.resolve(docsDir, target)

    if (!(await pathExists(sourceDir, 'directory'))) {
      logger.error(`The directory "${cyan(sourceDir)}" does not exist.`)
      process.exitCode = 1
      return
    }

    const sourceMatched = await glob(globs.map(removeLeadingSlash), {
      absolute: true,
      cwd: sourceDir,
      onlyFiles: false,
    })

    const sourceFilePaths = await getMatchedDocFilePaths(sourceMatched)

    // Files still to be translated; entries are removed as they are finished
    // or skipped, so a retry only revisits what is left.
    const allSourceFilePaths = new Set(sourceFilePaths.flat())

    const internalFilePaths = await glob(config.internalRoutes || [], {
      absolute: true,
      cwd: docsDir,
    })

    for (const internalFilePath of internalFilePaths) {
      allSourceFilePaths.delete(internalFilePath)
    }

    if (allSourceFilePaths.size === 0) {
      logger.error(
        `No files matched by the glob patterns: ${globs.map((g) => `\`${cyan(g)}\``).join(', ')}`,
      )
      process.exitCode = 1
      return
    }

    // Glob `*` only: also delete target files that have no source counterpart.
    if (isDeepStrictEqual(globs, ['*'])) {
      logger.warn(
        `You're running in a special mode, all files except \`${cyan('internalRoutes')}\` will be translated, and all ${red('unmatched')} target files will be ${red('removed')}.`,
      )

      const targetMatched = await glob(globs.map(removeLeadingSlash), {
        absolute: true,
        cwd: targetDir,
        onlyFiles: false,
      })

      const targetFilePaths = await getMatchedDocFilePaths(targetMatched)

      const allTargetFilePaths = new Set(targetFilePaths.flat())

      for (const internalFilePath of internalFilePaths) {
        allTargetFilePaths.delete(internalFilePath)
      }

      const toRemoveTargetFilePaths: string[] = []

      for (const targetFilePath of allTargetFilePaths) {
        const targetRelativePath = path.relative(targetDir, targetFilePath)
        const sourceFilePath = path.resolve(sourceDir, targetRelativePath)
        if (!allSourceFilePaths.has(sourceFilePath)) {
          toRemoveTargetFilePaths.push(targetFilePath)
        }
      }

      if (toRemoveTargetFilePaths.length > 0) {
        logger.warn(
          'Found unmatched target files will be removed:\n' +
            toRemoveTargetFilePaths.map((file) => `- ${red(file)}`).join('\n'),
        )
        await Promise.all(toRemoveTargetFilePaths.map((file) => fs.rm(file)))
      }
    }

    // One pass over every file still pending; rejects as soon as any file fails.
    const executor = async () =>
      await Promise.all(
        [...allSourceFilePaths].map(async (sourceFilePath) => {
          const sourceContent = await fs.readFile(sourceFilePath, 'utf-8')

          // `sourceSHA` found in the source frontmatter is dropped; the value
          // written to the target is always computed below.
          // eslint-disable-next-line @typescript-eslint/no-unused-vars
          const { sourceSHA: _sourceSHA, ...sourceFrontmatter } = matter(
            sourceContent,
          ).data as I18nFrontmatter

          if (sourceFrontmatter.i18n?.disableAutoTranslation) {
            allSourceFilePaths.delete(sourceFilePath)
            return
          }

          const sourceSHA = crypto
            .createHash('sha256')
            .update(sourceContent)
            .digest('hex')

          // Map the path through its location relative to the source
          // directory (same approach as the cleanup above) rather than a
          // string replace, which would interpret `$` patterns in `targetDir`
          // and depends on both strings using identical separators.
          const targetFilePath = path.resolve(
            targetDir,
            path.relative(sourceDir, sourceFilePath),
          )

          let targetContent: string | undefined
          let targetFrontmatter: I18nFrontmatter | undefined

          if (await pathExists(targetFilePath, 'file')) {
            targetContent = await fs.readFile(targetFilePath, 'utf-8')
            targetFrontmatter = matter(targetContent).data
            // Skip opted-out targets and targets already built from this
            // exact source content (unless forced).
            if (
              targetFrontmatter.i18n?.disableAutoTranslation ||
              (!force && targetFrontmatter.sourceSHA === sourceSHA)
            ) {
              allSourceFilePaths.delete(sourceFilePath)
              return
            }
          }

          await limit(async () => {
            const sourceRelativePath = path.relative(docsDir, sourceFilePath)
            const targetRelativePath = path.relative(docsDir, targetFilePath)

            logger.info(
              `Translating ${cyan(sourceRelativePath)} to ${cyan(targetRelativePath)}`,
            )

            const isMdx = sourceFilePath.endsWith('.mdx')
            const processor = isMdx ? mdxProcessor : mdProcessor

            const ast = processor.parse(escapeMarkdownHeadingIds(sourceContent))

            const targetBase = path.dirname(targetFilePath)

            const normalizeImgSrcOptions: NormalizeImgSrcOptions = {
              localPublicBase: path.resolve(docsDir, 'public'),
              sourceBase: path.dirname(sourceFilePath),
              targetBase,
              translating: { source, target, copy },
            }

            // Every top-level node goes through `normalizeImgSrc` before the
            // document is serialized again for the model.
            // NOTE(review): presumably this rewrites or copies relative
            // assets for the target directory — confirm in the plugins module.
            const normalizedSourceContent = processor.stringify({
              ...ast,
              children: ast.children.map((it) =>
                normalizeImgSrc(it, normalizeImgSrcOptions),
              ),
            })

            targetContent = await translate({
              ...config.translate,
              source,
              sourceContent: normalizedSourceContent,
              target,
              // A forced run ignores the previous translation.
              targetContent: force ? '' : targetContent,
              additionalPrompts:
                targetFrontmatter?.i18n?.additionalPrompts ??
                sourceFrontmatter.i18n?.additionalPrompts,
            })

            const { data, content } = matter(targetContent)

            // Later arguments win: source < existing target < model output.
            const newFrontmatter = merge(
              {},
              sourceFrontmatter,
              targetFrontmatter,
              data,
            )

            newFrontmatter.sourceSHA = sourceSHA

            // An explicit per-language title from the source overrides
            // whatever the merge produced.
            if (sourceFrontmatter.i18n?.title?.[target]) {
              newFrontmatter.title = sourceFrontmatter.i18n.title[target]
            }

            if (typeof newFrontmatter.title !== 'string') {
              delete newFrontmatter.title
            }

            targetContent = matter.stringify(
              content.startsWith('\n') ? content : '\n' + content,
              newFrontmatter,
            )

            await fs.mkdir(targetBase, { recursive: true })
            await fs.writeFile(targetFilePath, targetContent)

            logger.info(
              `${cyan(sourceRelativePath)} translated to ${cyan(targetRelativePath)}`,
            )

            allSourceFilePaths.delete(sourceFilePath)
          })
        }),
      )

    let retry = 0

    while (retry < 15) {
      try {
        await executor()
        return
      } catch (error) {
        if (error instanceof RateLimitError) {
          // Count every rate-limited attempt, including those where the
          // server supplies `retry-after`, so the loop is always bounded.
          retry++
          const retryAfter =
            Number(error.headers.get('retry-after')) || 60 * retry
          logger.warn(`Rate limit exceeded, retrying in ${retryAfter}s...`)
          // `retryAfter` is in seconds; the promise-based `setTimeout`
          // expects milliseconds.
          await setTimeout(retryAfter * 1000)
          continue
        }
        throw error
      }
    }

    logger.error(
      `Failed to translate after ${retry} retries, please try again later.`,
    )
    process.exitCode = 1
  })