
Commit f994790

refactored recurTranslate and split functionalities to different files
1 parent ba25b24 commit f994790

File tree

5 files changed: +420 −374 lines


i18n/config.ts

Lines changed: 16 additions & 5 deletions
@@ -1,8 +1,19 @@
  // maximum permissible concurrent translations
- const max_trans_num = Number(process.env.MAX_TRANSLATION_NO) || 5;
+ export const max_trans_num = Number(process.env.MAX_TRANSLATION_NO) || 5;

  // log file configs
- const translationSummaryPrefix = "translation-summary";
- const jsonSummaryPrefix = "json-summary";
-
- export {max_trans_num, translationSummaryPrefix, jsonSummaryPrefix}
+ export const translationSummaryPrefix: string = "translation-summary";
+ export const jsonSummaryPrefix: string = "json-summary";
+ export const ignoredTags: string[] = [
+   "LATEXINLINE",
+   "LATEX",
+   "SNIPPET",
+   "SCHEMEINLINE",
+   "SCHEME",
+   "LONG_PAGE",
+   "LABEL",
+   "HISTORY",
+   "REF",
+   "FIGURE",
+ ];
+ export const max_chunk_len: Number = Number(process.env.MAX_LEN) || 3000;
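
For orientation only (not part of this commit): a minimal sketch of how the new named exports might be consumed by a translation driver. The translateAll driver and the needsSplit/isIgnored helpers are hypothetical; only the names imported from i18n/config.ts come from the diff above.

    // Hypothetical consumer of the config exports shown above.
    import { max_trans_num, max_chunk_len, ignoredTags } from "./config";

    // Run translations in batches of at most max_trans_num at a time.
    export async function translateAll(
      files: string[],
      translateFile: (path: string) => Promise<void>
    ): Promise<void> {
      for (let i = 0; i < files.length; i += max_trans_num) {
        const batch = files.slice(i, i + max_trans_num);
        await Promise.all(batch.map(translateFile));
      }
    }

    // Chunks at or above max_chunk_len would be split further before translation;
    // elements whose tag name is in ignoredTags are passed through untranslated.
    export const needsSplit = (chunk: string) => chunk.length >= Number(max_chunk_len);
    export const isIgnored = (tagName: string) => ignoredTags.includes(tagName);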

i18n/controllers/parsers.ts

Lines changed: 306 additions & 0 deletions
@@ -0,0 +1,306 @@
+ import sax from "sax";
+ import { escapeXML, formatAttributes, strongEscapeXML } from "./xmlUtilities";
+ import { Readable } from "stream";
+ import { ignoredTags, max_chunk_len } from "../config";
+ import fs, { PathLike } from "fs";
+ import { FileLike } from "openai/uploads.mjs";
+
+ const MAXLEN = max_chunk_len;
+ const createParser = () =>
+   (sax as any).createStream(false, { trim: false }, { strictEntities: true });
+
+ export async function cleanParser(
+   text: string,
+   filePath: string,
+   logError: Function
+ ): Promise<string> {
+   let translatedChunk = "";
+   const safeText = escapeXML(text);
+   const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");
+   await new Promise<void>((resolve, reject) => {
+     // Create a SAX parser in strict mode for cleaning up translations.
+     const clean = createParser();
+
+     // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
+     let currDepth = -1;
+
+     clean.on("text", text => {
+       if (currDepth >= 1) {
+         translatedChunk += strongEscapeXML(text);
+       }
+     });
+
+     clean.on("opentag", node => {
+       currDepth++;
+       if (node.name != "WRAPPER" && node.name != "TRANSLATE") {
+         translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
+       }
+     });
+
+     clean.on("closetag", tagName => {
+       if (tagName != "WRAPPER" && tagName != "TRANSLATE") {
+         translatedChunk += `</${tagName}>`;
+       }
+       currDepth--;
+     });
+
+     clean.on("cdata", cdata => {
+       translatedChunk += `<![CDATA[${cdata}]]>`;
+     });
+
+     clean.on("comment", comment => {
+       translatedChunk += `<!-- ${comment} -->`;
+     });
+
+     clean.on("error", error => {
+       // Log only once with abbreviated content
+       logError(`Error validating AI response for ${filePath}`, error, filePath);
+
+       // Attempt to recover using the internal parser
+       try {
+         clean._parser.error = null;
+         clean._parser.resume();
+         // Continue processing despite the error
+         resolve();
+       } catch (e) {
+         // Add error comment and resolve instead of rejecting
+         translatedChunk += `<!-- XML validation error -->`;
+         resolve();
+       }
+     });
+
+     clean.once("end", resolve);
+
+     textStream.pipe(clean);
+   });
+   return translatedChunk;
+ }
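
Not part of the diff: a minimal sketch of calling cleanParser on a raw model response. The import path, response string, and logger are placeholders, and the exact escaping depends on escapeXML/strongEscapeXML in xmlUtilities, which are not shown in this commit excerpt.

    // Hypothetical call site for cleanParser (placeholder values).
    import { cleanParser } from "./parsers";

    const logError = (msg: string, err: unknown, file: string) =>
      console.error(msg, err, file);

    async function demo(): Promise<void> {
      const raw =
        "Sure! Here is the translation: <TRANSLATE><P>Bonjour</P></TRANSLATE>";
      const cleaned = await cleanParser(raw, "chapter1.xml", logError);
      // Text outside the wrapped XML and the TRANSLATE wrapper itself are dropped;
      // the inner markup and its text content are kept.
      console.log(cleaned);
    }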
+
+ export async function splitParser(filePath: PathLike, logError: Function): Promise<[boolean, string][]> {
+   // Create a SAX parser in strict mode to split source into chunks.
+   const parser = createParser();
+
+   const segments: [boolean, string][] = [];
+   await new Promise<void>((resolve, reject) => {
+     // Variables to track current depth and segments.
+     let currentDepth = 0;
+     let currentSegment = "";
+
+     // In this context:
+     // - Depth 0: Before any element is opened.
+     // - Depth 1: The root element (<CHAPTER>).
+     // - Depth 2: Each direct child of the root that we want to capture.
+     let isRecording = false;
+
+     parser.on("opentag", node => {
+       currentDepth++;
+
+       if (currentDepth === 2 || isRecording) {
+         isRecording = true;
+         currentSegment += `<${node.name}${formatAttributes(node.attributes)}>`;
+       } else {
+         segments.push([
+           false,
+           `<${node.name}${formatAttributes(node.attributes)}>`
+         ]);
+       }
+     });
+
+     parser.on("text", text => {
+       text = strongEscapeXML(text);
+
+       if (isRecording) {
+         currentSegment += text;
+       } else {
+         segments.push([false, text]);
+       }
+     });
+
+     parser.on("cdata", cdata => {
+       if (isRecording) {
+         currentSegment += `<![CDATA[${cdata}]]>`;
+       }
+     });
+
+     parser.on("closetag", tagName => {
+       if (isRecording) {
+         currentSegment += `</${tagName}>`;
+       }
+
+       if (currentDepth === 2) {
+         isRecording = false;
+         // We are closing a segment element.
+         if (ignoredTags.includes(tagName)) {
+           segments.push([false, currentSegment]);
+         } else {
+           if (
+             segments.length > 0 &&
+             segments[segments.length - 1][0] &&
+             segments[segments.length - 1][1].length + currentSegment.length <
+               Number(MAXLEN)
+           ) {
+             segments[segments.length - 1][1] += currentSegment;
+           } else {
+             segments.push([true, currentSegment]);
+           }
+         }
+         currentSegment = "";
+       }
+
+       if (currentDepth === 1) {
+         // We are closing the root element.
+         segments.push([false, `</${tagName}>`]);
+       }
+
+       currentDepth--;
+     });
+
+     parser.on("comment", comment => {
+       if (isRecording) {
+         currentSegment += `<!-- ${comment} -->`;
+       } else {
+         segments.push([false, `<!-- ${comment} -->`]);
+       }
+     });
+
+     parser.on("end", async () => {
+       resolve();
+     });
+
+     parser.on("error", err => {
+       logError(`Parser error in ${filePath}:`, err, filePath);
+       // Try to recover and continue
+       try {
+         parser._parser.error = null;
+         parser._parser.resume();
+       } catch (resumeErr) {
+         logError(`Could not recover from parser error:`, resumeErr, filePath);
+         reject(err);
+       }
+     });
+
+     // Use the file path directly without modification
+     fs.createReadStream(filePath).pipe(parser);
+   });
+
+   return segments;
+ }
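
Not part of the diff: a sketch of how the [needsTranslation, text] pairs returned by splitParser might be consumed. translateChunk is a placeholder for whatever sends a chunk to the model.

    // Hypothetical consumer of splitParser's output.
    import { splitParser } from "./parsers";

    async function translateFile(
      path: string,
      translateChunk: (xml: string) => Promise<string>,
      logError: (msg: string, err: unknown, file: string) => void
    ): Promise<string> {
      const segments = await splitParser(path, logError);
      const out: string[] = [];
      for (const [needsTranslation, text] of segments) {
        // Only segments flagged true go to the model; markup-only text and
        // ignored-tag segments are copied through unchanged.
        out.push(needsTranslation ? await translateChunk(text) : text);
      }
      return out.join("");
    }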
+
+ export async function recurSplitParser(ori: string, filePath: PathLike, logError: Function): Promise<string[]> {
+   let subTranslated: string[] = [];
+   // continue splitting the chunk
+   // Create a SAX parser in strict mode to split source into chunks.
+   await new Promise<void>((resolve, reject) => {
+     const subParser = createParser();
+
+     let subCurrentDepth = 0;
+     let subCurrentSegment = "";
+     const subSegments: [boolean, string][] = [];
+     let subIsRecording = false;
+
+     subParser.on("opentag", node => {
+       if (node.name === "WRAPPER") return;
+
+       subCurrentDepth++;
+
+       if (subCurrentDepth === 2) subIsRecording = true;
+
+       if (subIsRecording) {
+         subCurrentSegment += `<${node.name}${formatAttributes(node.attributes)}>`;
+       } else {
+         subSegments.push([
+           false,
+           `<${node.name}${formatAttributes(node.attributes)}>`
+         ]);
+       }
+     });
+
+     subParser.on("text", text => {
+       text = strongEscapeXML(text);
+       if (subIsRecording) {
+         subCurrentSegment += text;
+       } else if (
+         subSegments.length > 0 &&
+         subSegments[subSegments.length - 1][0]
+       ) {
+         subSegments[subSegments.length - 1][1] += text;
+       } else if (
+         text.trim() === "" ||
+         text.trim() === "," ||
+         text.trim() === "."
+       ) {
+         subSegments.push([false, text]);
+       } else {
+         subSegments.push([true, text]);
+       }
+     });
+
+     subParser.on("cdata", cdata => {
+       if (subIsRecording) {
+         subCurrentSegment += `<![CDATA[${cdata}]]>`;
+       }
+     });
+
+     subParser.on("closetag", tagName => {
+       if (tagName === "WRAPPER") {
+         return;
+       }
+
+       subCurrentSegment += `</${tagName}>`;
+
+       if (subCurrentDepth === 2) {
+         // We are closing a segment element.
+         if (ignoredTags.includes(tagName)) {
+           subSegments.push([false, subCurrentSegment]);
+         } else if (
+           subSegments.length > 0 &&
+           subSegments[subSegments.length - 1][0] &&
+           subSegments[subSegments.length - 1][1].length +
+             subCurrentSegment.length <
+             Number(MAXLEN)
+         ) {
+           subSegments[subSegments.length - 1][1] += subCurrentSegment;
+         } else {
+           subSegments.push([true, subCurrentSegment]);
+         }
+         subCurrentSegment = "";
+         subIsRecording = false;
+       }
+
+       if (subCurrentDepth === 1) {
+         subSegments.push([false, `</${tagName}>`]);
+         subCurrentSegment = "";
+       }
+
+       subCurrentDepth--;
+     });
+
+     subParser.on("comment", comment => {
+       if (subIsRecording) {
+         subCurrentSegment += `<!-- ${comment} -->`;
+       } else {
+         subSegments.push([false, `<!-- ${comment} -->`]);
+       }
+     });
+
+     subParser.on("end", async () =>
+       resolve()
+     );
+
+     subParser.on("error", err => {
+       logError(`Error in subParser for ${filePath}:`, err, filePath);
+       // Try to recover and continue
+       try {
+         subParser._parser.error = null;
+         subParser._parser.resume();
+       } catch (resumeErr) {
+         logError(`Could not recover from parser error:`, resumeErr, filePath);
+         reject(err);
+       }
+     });
+
+     Readable.from("<WRAPPER>" + ori + "</WRAPPER>").pipe(subParser);
+   });
+
+   return subTranslated;
+ }
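
Not part of the diff: a sketch of where recursive splitting might fit. The threshold check mirrors the MAXLEN comparison above; handleChunk is hypothetical, and translating the sub-segments collected by recurSplitParser is left to the caller (presumably the recurTranslate mentioned in the commit message, which is not shown here).

    // Hypothetical caller that re-splits oversized chunks one level deeper.
    import { recurSplitParser } from "./parsers";
    import { max_chunk_len } from "../config";

    async function handleChunk(
      chunk: string,
      filePath: string,
      logError: (msg: string, err: unknown, file: string) => void
    ): Promise<string[]> {
      if (chunk.length < Number(max_chunk_len)) {
        return [chunk]; // small enough to translate directly
      }
      // Descend into the chunk's children and split again at the next depth.
      return recurSplitParser(chunk, filePath, logError);
    }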
