Skip to content

Commit bb38a14

Browse files
youyongsong and JounQin authored
feat: add chunk-based translation for large files to handle TPM limits (#92)
Signed-off-by: JounQin <[email protected]> Co-authored-by: JounQin <[email protected]>
1 parent b4d2bdb commit bb38a14

File tree

2 files changed

+82
-1
lines changed

2 files changed

+82
-1
lines changed

.changeset/selfish-worms-help.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@alauda/doom": minor
3+
---
4+
5+
feat: add chunk-based translation for large files to handle TPM limits

src/cli/translate.ts

Lines changed: 77 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,10 @@ You are a professional technical documentation engineer, skilled in writing high
9292
<% if (terms) { %>
9393
<%- terms %>
9494
<% } %>
95+
<% if (isChunk) { %>
96+
## Chunk Translation Notice
97+
This is part of a larger document that has been split into smaller chunks for translation. Please translate this chunk as if it's part of a continuous document, maintaining consistency with the overall document style and context.
98+
<% } %>
9599
96100
<% if (userPrompt || additionalPrompts) { %>
97101
## Additional Requirements
@@ -117,6 +121,7 @@ export interface InternalTranslateOptions extends TranslateOptions {
117121
sourceContent: string
118122
target: Language
119123
additionalPrompts?: string
124+
isChunk?: boolean
120125
}
121126

122127
const resolveTerms = async (
@@ -220,13 +225,83 @@ function getTitleTranslation(
220225
return null
221226
}
222227

228+
/**
 * Splits `content` into newline-delimited chunks whose UTF-8 size stays within
 * `maxChunkSize` bytes wherever possible.
 *
 * Lines are never broken apart, so a single line larger than `maxChunkSize`
 * is emitted as its own (oversized) chunk. Joining the returned chunks with
 * `'\n'` reproduces `content` exactly.
 *
 * @param content - The text to split.
 * @param maxChunkSize - Upper bound, in UTF-8 bytes, for each chunk.
 * @returns The chunks in document order; always contains at least one entry.
 */
function splitContentIntoChunks(
  content: string,
  maxChunkSize: number,
): string[] {
  const chunks: string[] = []
  let currentLines: string[] = []
  let currentSize = 0

  for (const line of content.split('\n')) {
    const lineSize = Buffer.byteLength(line, 'utf8')
    // The '\n' separator only exists between lines of the same chunk, so it is
    // charged when appending to a non-empty chunk rather than for every line.
    // Charging it unconditionally would split chunks that fit the limit exactly.
    const appendedSize = currentLines.length > 0 ? lineSize + 1 : lineSize

    if (currentLines.length > 0 && currentSize + appendedSize > maxChunkSize) {
      // The current chunk is full: flush it and start a new one with this line.
      chunks.push(currentLines.join('\n'))
      currentLines = [line]
      currentSize = lineSize
    } else {
      currentLines.push(line)
      currentSize += appendedSize
    }
  }

  // Flush whatever is left after the final line.
  if (currentLines.length > 0) {
    chunks.push(currentLines.join('\n'))
  }

  return chunks
}
258+
259+
/**
 * Translates `options.sourceContent`, splitting it into smaller chunks first
 * when its UTF-8 size exceeds `maxChunkSize`.
 *
 * Content within the limit is passed to `translate` unchanged. Larger content
 * is split on line boundaries, each chunk is translated with `isChunk: true`
 * (which enables the "Chunk Translation Notice" section of the prompt), and
 * the translated chunks are joined back together with `'\n'`.
 *
 * @param options - Translation options; `sourceContent` is the text to translate.
 * @param maxChunkSize - Upper bound, in UTF-8 bytes, for the content sent in a
 *   single `translate` call. Defaults to 60 KiB.
 * @returns The translated content.
 */
export const translateWithChunks = async (
  options: InternalTranslateOptions,
  maxChunkSize = 60 * 1024,
): Promise<string> => {
  const { sourceContent } = options

  const contentSize = Buffer.byteLength(sourceContent, 'utf8')
  if (contentSize <= maxChunkSize) {
    return translate(options)
  }

  logger.info(
    `Content size (${Math.round(contentSize / 1024)}KB) exceeds limit, splitting into chunks...`,
  )

  const chunks = splitContentIntoChunks(sourceContent, maxChunkSize)
  logger.info(`Split content into ${chunks.length} chunks`)

  const translatedChunks: string[] = []

  // Chunks are awaited one at a time rather than in parallel; presumably this
  // keeps request volume within the TPM limit this feature targets.
  for (const [index, chunk] of chunks.entries()) {
    logger.info(`Translating chunk ${index + 1}/${chunks.length}...`)

    translatedChunks.push(
      await translate({
        ...options,
        sourceContent: chunk,
        isChunk: true,
      }),
    )
  }

  logger.info(`Successfully translated ${chunks.length} chunks`)

  return translatedChunks.join('\n')
}
296+
223297
export const translate = async ({
224298
source,
225299
sourceContent,
226300
target,
227301
systemPrompt,
228302
userPrompt = '',
229303
additionalPrompts = '',
304+
isChunk = false,
230305
}: InternalTranslateOptions) => {
231306
if (!openai) {
232307
openai = new AzureOpenAI({
@@ -275,6 +350,7 @@ export const translate = async ({
275350
additionalPrompts: additionalPrompts,
276351
terms,
277352
titleTranslationPrompt,
353+
isChunk,
278354
},
279355
{ async: true },
280356
)
@@ -543,7 +619,7 @@ export const translateCommand = new Command('translate')
543619
),
544620
})
545621

546-
targetContent = await translate({
622+
targetContent = await translateWithChunks({
547623
...config.translate,
548624
source,
549625
sourceContent: normalizedSourceContent,

0 commit comments

Comments
 (0)