Skip to content

Commit a115312

Browse files
committed
Refactor translation logic to simplify segment handling and improve readability
1 parent d110429 commit a115312

File tree

1 file changed

+59
-66
lines changed

1 file changed

+59
-66
lines changed

i18n/controllers/recurTranslate.ts

Lines changed: 59 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ import dotenv from "dotenv";
66
import sax from "sax";
77
import { Readable } from "stream";
88
import { fileURLToPath } from "url";
9+
import { isGeneratorObject } from "util/types";
910

1011
dotenv.config();
1112

@@ -19,6 +20,8 @@ const ai = new OpenAI({
1920
baseURL: process.env.AI_BASEURL
2021
});
2122

23+
const ignoredTags = ["LATEXINLINE", "LATEX", "SNIPPET", "SCHEMEINLINE", "SCHEME", "LONG_PAGE", "LABEL"];
24+
2225
const MAXLEN = Number(process.env.MAX_LEN) || 3000;
2326

2427
const createParser = () =>
@@ -58,8 +61,8 @@ async function recursivelyTranslate(
5861
path: string
5962
): Promise<string> {
6063
// Recursive function to split and translate
61-
async function helper(ori: string, force: boolean): Promise<string> {
62-
if (ori.length < MAXLEN && !force) {
64+
async function helper(ori: string): Promise<string> {
65+
if (ori.length < MAXLEN) {
6366
return await translateChunk(ori); // translate the chunk
6467
}
6568

@@ -69,21 +72,19 @@ async function recursivelyTranslate(
6972
await new Promise<void>((resolve, reject) => {
7073
const subParser = createParser();
7174

72-
let subCurrentDepth = 1;
75+
let subCurrentDepth = 0;
7376
let subCurrentSegment = "";
7477
const subSegments: [boolean, string][] = [];
7578
let subIsRecording = false;
7679

7780
subParser.on("opentag", node => {
78-
if (node.name === "WRAPPER") {
79-
return;
80-
}
81-
81+
if (node.name === "WRAPPER") return;
82+
8283
subCurrentDepth++;
8384

84-
// If we're at depth 2, this is the start of a new segment.
85-
if (subCurrentDepth === 2 || subIsRecording) {
86-
subIsRecording = true;
85+
if (subCurrentDepth === 2) subIsRecording = true;
86+
87+
if (subIsRecording) {
8788
subCurrentSegment += `<${node.name}${formatAttributes(node.attributes)}>`;
8889
} else {
8990
subSegments.push([
@@ -97,21 +98,19 @@ async function recursivelyTranslate(
9798
text = strongEscapeXML(text);
9899
if (subIsRecording) {
99100
subCurrentSegment += text;
101+
} else if (
102+
subSegments.length > 0 &&
103+
subSegments[subSegments.length - 1][0]
104+
) {
105+
subSegments[subSegments.length - 1][1] += text;
106+
} else if (
107+
text.trim() === "" ||
108+
text.trim() === "," ||
109+
text.trim() === "."
110+
) {
111+
subSegments.push([false, text]);
100112
} else {
101-
if (
102-
subSegments.length > 0 &&
103-
subSegments[subSegments.length - 1][0]
104-
) {
105-
subSegments[subSegments.length - 1][1] += text;
106-
} else if (
107-
text.trim() !== "" ||
108-
text.trim() === "," ||
109-
text.trim() === "."
110-
) {
111-
subSegments.push([false, text]);
112-
} else {
113-
subSegments.push([true, text]);
114-
}
113+
subSegments.push([true, text]);
115114
}
116115
});
117116

@@ -125,41 +124,35 @@ async function recursivelyTranslate(
125124
if (tagName === "WRAPPER") {
126125
return;
127126
}
128-
129-
if (subIsRecording) {
130-
subCurrentSegment += `</${tagName}>`;
131-
}
127+
128+
subCurrentSegment += `</${tagName}>`;
132129

133130
if (subCurrentDepth === 2) {
134131
// We are closing a segment element.
135132
if (
136-
tagName === "LATEXINLINE" ||
137-
tagName === "LATEX" ||
138-
tagName === "SNIPPET" ||
139-
tagName === "SCHEMEINLINE"
133+
ignoredTags.includes(tagName)
140134
) {
141135
subSegments.push([false, subCurrentSegment]);
136+
} else if (
137+
subSegments.length > 0 &&
138+
subSegments[subSegments.length - 1][0] &&
139+
subSegments[subSegments.length - 1][1].length +
140+
subCurrentSegment.length <
141+
MAXLEN
142+
) {
143+
subSegments[subSegments.length - 1][1] += subCurrentSegment;
142144
} else {
143-
if (
144-
subSegments.length > 0 &&
145-
subSegments[subSegments.length - 1][0] &&
146-
(subSegments[subSegments.length - 1][1].length +
147-
subCurrentSegment.length) <
148-
MAXLEN
149-
) {
150-
subSegments[subSegments.length - 1][1] += subCurrentSegment;
151-
} else {
152145
subSegments.push([true, subCurrentSegment]);
153-
}
154146
}
155147
subCurrentSegment = "";
156148
subIsRecording = false;
157149
}
158-
150+
159151
if (subCurrentDepth === 1) {
160-
// We are closing the root element.
161-
subSegments.push([false, `</${tagName}>`]);
152+
subSegments.push([false, `</${tagName}>`])
153+
subCurrentSegment = "";
162154
}
155+
163156
subCurrentDepth--;
164157
});
165158

@@ -174,7 +167,7 @@ async function recursivelyTranslate(
174167
subParser.on("end", async () => {
175168
for (const segment of subSegments) {
176169
if (segment[0]) {
177-
subTranslated.push(await helper(segment[1], false));
170+
subTranslated.push(await helper(segment[1]));
178171
} else {
179172
subTranslated.push(segment[1]);
180173
}
@@ -248,28 +241,23 @@ async function recursivelyTranslate(
248241
}
249242

250243
if (currentDepth === 2) {
244+
isRecording = false;
251245
// We are closing a segment element.
252-
if (
253-
tagName === "LATEXINLINE" ||
254-
tagName === "LATEX" ||
255-
tagName === "SNIPPET" ||
256-
tagName === "SCHEMEINLINE" ||
257-
tagName === "SCHEME"
258-
) {
246+
if (ignoredTags.includes(tagName)) {
259247
segments.push([false, currentSegment]);
260248
} else {
261249
if (
262250
segments.length > 0 &&
263251
segments[segments.length - 1][0] &&
264-
(segments[segments.length - 1][1].length +
265-
currentSegment.length) <
252+
segments[segments.length - 1][1].length + currentSegment.length <
266253
MAXLEN
267254
) {
268255
segments[segments.length - 1][1] += currentSegment;
269256
} else {
270-
segments.push([true, currentSegment]);
257+
segments.push([true, currentSegment]);
271258
}
272259
}
260+
currentSegment = "";
273261
}
274262

275263
if (currentDepth === 1) {
@@ -291,7 +279,7 @@ async function recursivelyTranslate(
291279
parser.on("end", async () => {
292280
for (const segment of segments) {
293281
if (segment[0]) {
294-
translated.push(await helper(segment[1], false));
282+
translated.push(await helper(segment[1]));
295283
} else {
296284
translated.push(segment[1]);
297285
}
@@ -314,12 +302,12 @@ async function recursivelyTranslate(
314302
if (chunk.trim() === "" || chunk.trim() === "," || chunk.trim() === ".") {
315303
return chunk;
316304
}
317-
305+
318306
// console.log("Translating chunk of length: " + chunk.length);
319-
if (chunk.length < 100) {
320-
console.log("\nchunk: " + chunk)
321-
}
322-
307+
// if (chunk.length < 100) {
308+
// console.log("\nchunk: " + chunk);
309+
// }
310+
323311
let translatedChunk = "";
324312

325313
try {
@@ -332,7 +320,7 @@ async function recursivelyTranslate(
332320
Content to translate:
333321
<TRANSLATE> ${chunk} </TRANSLATE>`
334322
});
335-
323+
336324
const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
337325
assistant_id: assistant_id
338326
});
@@ -353,7 +341,7 @@ async function recursivelyTranslate(
353341
const text = messageContent.text;
354342

355343
const safeText = escapeXML(text.value);
356-
console.log(safeText);
344+
// const safeText = chunk;
357345
const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");
358346

359347
await new Promise<void>((resolve, reject) => {
@@ -394,13 +382,18 @@ async function recursivelyTranslate(
394382
clean.on("error", error => {
395383
console.log(
396384
"error encountered when validating XML: " +
397-
error + "\nfile: " + path +
385+
error +
386+
"\nfile: " +
387+
path +
398388
"\n section: " +
399-
(safeText.length > 50 ? safeText.substring(0, 100) + "..." : safeText )
389+
safeText +
390+
"\n original text: " +
391+
chunk
400392
);
401393

402394
// Attempt to recover using the internal parser
403395
try {
396+
clean._parser.error = null;
404397
clean._parser.resume();
405398
} catch (e) {
406399
console.log("Failed to resume parser:", e);

0 commit comments

Comments
 (0)