import sax from "sax";
import { escapeXML, formatAttributes, strongEscapeXML } from "./xmlUtilities";
import { Readable } from "stream";
import { ignoredTags, max_chunk_len } from "../config";
import fs, { PathLike } from "fs";

const MAXLEN = max_chunk_len;
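// Factory for a non-strict SAX stream parser: non-strict mode uppercases tag
// names, and { trim: false } keeps whitespace in text nodes intact.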
const createParser = () =>
  (sax as any).createStream(false, { trim: false }, { strictEntities: true });

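/**
 * Re-parses a translated chunk and keeps only the XML content, discarding any
 * stray text the model produced outside of tags.
 */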
export async function cleanParser(
  text: string,
  filePath: string,
  logError: Function
): Promise<string> {
  let translatedChunk = "";
  const safeText = escapeXML(text);
  const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");
  await new Promise<void>((resolve, reject) => {
    // Non-strict SAX parser that cleans up the translation by dropping any
    // text the LLM emitted outside of XML tags (artifacts, annotations, etc.).
    const clean = createParser();

    // Depth of the current element; text is kept only at depth >= 1,
    // i.e. inside an actual element nested under the WRAPPER.
    let currDepth = -1;

    clean.on("text", text => {
      if (currDepth >= 1) {
        translatedChunk += strongEscapeXML(text);
      }
    });

    clean.on("opentag", node => {
      currDepth++;
      if (node.name != "WRAPPER" && node.name != "TRANSLATE") {
        translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
      }
    });

    clean.on("closetag", tagName => {
      if (tagName != "WRAPPER" && tagName != "TRANSLATE") {
        translatedChunk += `</${tagName}>`;
      }
      currDepth--;
    });

    clean.on("cdata", cdata => {
      translatedChunk += `<![CDATA[${cdata}]]>`;
    });

    clean.on("comment", comment => {
      translatedChunk += `<!-- ${comment} -->`;
    });

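    // Malformed XML from the model is logged and skipped rather than failing
    // the whole chunk.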
    clean.on("error", error => {
      // Log only once with abbreviated content
      logError(`Error validating AI response for ${filePath}`, error, filePath);

      // Attempt to recover using the internal parser
      try {
        clean._parser.error = null;
        clean._parser.resume();
        // Continue processing despite the error
        resolve();
      } catch (e) {
        // Add error comment and resolve instead of rejecting
        translatedChunk += `<!-- XML validation error -->`;
        resolve();
      }
    });

    clean.once("end", resolve);

    textStream.pipe(clean);
  });
  return translatedChunk;
}

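/**
 * Streams the source file through a SAX parser and splits it into
 * [translatable, content] pairs: direct children of the root become candidate
 * chunks, and small neighbouring chunks are merged up to MAXLEN characters.
 */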
export async function splitParser(filePath: PathLike, logError: Function): Promise<[boolean, string][]> {
  // Non-strict SAX parser used to split the source into chunks.
  const parser = createParser();

  const segments: [boolean, string][] = [];
  await new Promise<void>((resolve, reject) => {
    // Variables to track current depth and segments.
    let currentDepth = 0;
    let currentSegment = "";

    // In this context:
    // - Depth 0: Before any element is opened.
    // - Depth 1: The root element (<CHAPTER>).
    // - Depth 2: Each direct child of the root that we want to capture.
    let isRecording = false;

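    // Open tags at depth 2 start (or continue) a recorded segment; shallower
    // tags such as the root are emitted as non-translatable markup.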
    parser.on("opentag", node => {
      currentDepth++;

      if (currentDepth === 2 || isRecording) {
        isRecording = true;
        currentSegment += `<${node.name}${formatAttributes(node.attributes)}>`;
      } else {
        segments.push([
          false,
          `<${node.name}${formatAttributes(node.attributes)}>`
        ]);
      }
    });

    parser.on("text", text => {
      text = strongEscapeXML(text);

      if (isRecording) {
        currentSegment += text;
      } else {
        segments.push([false, text]);
      }
    });

    parser.on("cdata", cdata => {
      if (isRecording) {
        currentSegment += `<![CDATA[${cdata}]]>`;
      }
    });

    parser.on("closetag", tagName => {
      if (isRecording) {
        currentSegment += `</${tagName}>`;
      }

      if (currentDepth === 2) {
        isRecording = false;
        // We are closing a segment element.
        if (ignoredTags.includes(tagName)) {
          segments.push([false, currentSegment]);
        } else {
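          // Merge with the previous translatable segment as long as the
          // combined length stays under the MAXLEN chunk limit.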
          if (
            segments.length > 0 &&
            segments[segments.length - 1][0] &&
            segments[segments.length - 1][1].length + currentSegment.length <
              Number(MAXLEN)
          ) {
            segments[segments.length - 1][1] += currentSegment;
          } else {
            segments.push([true, currentSegment]);
          }
        }
        currentSegment = "";
      }

      if (currentDepth === 1) {
        // We are closing the root element.
        segments.push([false, `</${tagName}>`]);
      }

      currentDepth--;
    });

    parser.on("comment", comment => {
      if (isRecording) {
        currentSegment += `<!-- ${comment} -->`;
      } else {
        segments.push([false, `<!-- ${comment} -->`]);
      }
    });

    parser.on("end", async () => {
      resolve();
    });

    parser.on("error", err => {
      logError(`Parser error in ${filePath}:`, err, filePath);
      // Try to recover and continue
      try {
        parser._parser.error = null;
        parser._parser.resume();
      } catch (resumeErr) {
        logError(`Could not recover from parser error:`, resumeErr, filePath);
        reject(err);
      }
    });

    // Stream the source file into the parser.
    fs.createReadStream(filePath).pipe(parser);
  });

  return segments;
}

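/**
 * Splits an oversized chunk further: the chunk is wrapped in <WRAPPER> and its
 * depth-2 children (and bare text) are segmented the same way as in splitParser.
 */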
export async function recurSplitParser(ori: string, filePath: PathLike, logError: Function): Promise<string[]> {
  let subTranslated: string[] = [];
  // Continue splitting the chunk with another non-strict SAX parser.
  await new Promise<void>((resolve, reject) => {
    const subParser = createParser();

    let subCurrentDepth = 0;
    let subCurrentSegment = "";
    const subSegments: [boolean, string][] = [];
    let subIsRecording = false;

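    // The synthetic WRAPPER element is skipped so that depths match the
    // original document.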
    subParser.on("opentag", node => {
      if (node.name === "WRAPPER") return;

      subCurrentDepth++;

      if (subCurrentDepth === 2) subIsRecording = true;

      if (subIsRecording) {
        subCurrentSegment += `<${node.name}${formatAttributes(node.attributes)}>`;
      } else {
        subSegments.push([
          false,
          `<${node.name}${formatAttributes(node.attributes)}>`
        ]);
      }
    });

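    // Text outside a recorded element is appended to the previous translatable
    // segment when one exists; whitespace-only text or a lone "," / "." passes
    // through untranslated; anything else becomes its own translatable segment.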
    subParser.on("text", text => {
      text = strongEscapeXML(text);
      if (subIsRecording) {
        subCurrentSegment += text;
      } else if (
        subSegments.length > 0 &&
        subSegments[subSegments.length - 1][0]
      ) {
        subSegments[subSegments.length - 1][1] += text;
      } else if (
        text.trim() === "" ||
        text.trim() === "," ||
        text.trim() === "."
      ) {
        subSegments.push([false, text]);
      } else {
        subSegments.push([true, text]);
      }
    });

    subParser.on("cdata", cdata => {
      if (subIsRecording) {
        subCurrentSegment += `<![CDATA[${cdata}]]>`;
      }
    });

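    // Closing a depth-2 element ends the current sub-segment; as in splitParser,
    // small neighbouring segments are merged while they fit under MAXLEN.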
    subParser.on("closetag", tagName => {
      if (tagName === "WRAPPER") {
        return;
      }

      subCurrentSegment += `</${tagName}>`;

      if (subCurrentDepth === 2) {
        // We are closing a segment element.
        if (ignoredTags.includes(tagName)) {
          subSegments.push([false, subCurrentSegment]);
        } else if (
          subSegments.length > 0 &&
          subSegments[subSegments.length - 1][0] &&
          subSegments[subSegments.length - 1][1].length +
            subCurrentSegment.length <
            Number(MAXLEN)
        ) {
          subSegments[subSegments.length - 1][1] += subCurrentSegment;
        } else {
          subSegments.push([true, subCurrentSegment]);
        }
        subCurrentSegment = "";
        subIsRecording = false;
      }

      if (subCurrentDepth === 1) {
        subSegments.push([false, `</${tagName}>`]);
        subCurrentSegment = "";
      }

      subCurrentDepth--;
    });

    subParser.on("comment", comment => {
      if (subIsRecording) {
        subCurrentSegment += `<!-- ${comment} -->`;
      } else {
        subSegments.push([false, `<!-- ${comment} -->`]);
      }
    });

    subParser.on("end", () => {
      // Surface the collected sub-chunks to the caller.
      subTranslated = subSegments.map(([, segment]) => segment);
      resolve();
    });

    subParser.on("error", err => {
      logError(`Error in subParser for ${filePath}:`, err, filePath);
      // Try to recover and continue
      try {
        subParser._parser.error = null;
        subParser._parser.resume();
      } catch (resumeErr) {
        logError(`Could not recover from parser error:`, resumeErr, filePath);
        reject(err);
      }
    });

    Readable.from("<WRAPPER>" + ori + "</WRAPPER>").pipe(subParser);
  });

  return subTranslated;
}