@@ -6,6 +6,7 @@ import dotenv from "dotenv";
6
6
import sax from "sax" ;
7
7
import { Readable } from "stream" ;
8
8
import { fileURLToPath } from "url" ;
9
+ import { isGeneratorObject } from "util/types" ;
9
10
10
11
dotenv . config ( ) ;
11
12
@@ -19,6 +20,8 @@ const ai = new OpenAI({
19
20
baseURL : process . env . AI_BASEURL
20
21
} ) ;
21
22
23
+ const ignoredTags = [ "LATEXINLINE" , "LATEX" , "SNIPPET" , "SCHEMEINLINE" , "SCHEME" , "LONG_PAGE" , "LABEL" ] ;
24
+
22
25
const MAXLEN = Number ( process . env . MAX_LEN ) || 3000 ;
23
26
24
27
const createParser = ( ) =>
@@ -58,8 +61,8 @@ async function recursivelyTranslate(
58
61
path : string
59
62
) : Promise < string > {
60
63
// Recursive function to split and translate
61
- async function helper ( ori : string , force : boolean ) : Promise < string > {
62
- if ( ori . length < MAXLEN && ! force ) {
64
+ async function helper ( ori : string ) : Promise < string > {
65
+ if ( ori . length < MAXLEN ) {
63
66
return await translateChunk ( ori ) ; // translate the chunk
64
67
}
65
68
@@ -69,21 +72,19 @@ async function recursivelyTranslate(
69
72
await new Promise < void > ( ( resolve , reject ) => {
70
73
const subParser = createParser ( ) ;
71
74
72
- let subCurrentDepth = 1 ;
75
+ let subCurrentDepth = 0 ;
73
76
let subCurrentSegment = "" ;
74
77
const subSegments : [ boolean , string ] [ ] = [ ] ;
75
78
let subIsRecording = false ;
76
79
77
80
subParser . on ( "opentag" , node => {
78
- if ( node . name === "WRAPPER" ) {
79
- return ;
80
- }
81
-
81
+ if ( node . name === "WRAPPER" ) return ;
82
+
82
83
subCurrentDepth ++ ;
83
84
84
- // If we're at depth 2, this is the start of a new segment.
85
- if ( subCurrentDepth === 2 || subIsRecording ) {
86
- subIsRecording = true ;
85
+ if ( subCurrentDepth === 2 ) subIsRecording = true ;
86
+
87
+ if ( subIsRecording ) {
87
88
subCurrentSegment += `<${ node . name } ${ formatAttributes ( node . attributes ) } >` ;
88
89
} else {
89
90
subSegments . push ( [
@@ -97,21 +98,19 @@ async function recursivelyTranslate(
97
98
text = strongEscapeXML ( text ) ;
98
99
if ( subIsRecording ) {
99
100
subCurrentSegment += text ;
101
+ } else if (
102
+ subSegments . length > 0 &&
103
+ subSegments [ subSegments . length - 1 ] [ 0 ]
104
+ ) {
105
+ subSegments [ subSegments . length - 1 ] [ 1 ] += text ;
106
+ } else if (
107
+ text . trim ( ) === "" ||
108
+ text . trim ( ) === "," ||
109
+ text . trim ( ) === "."
110
+ ) {
111
+ subSegments . push ( [ false , text ] ) ;
100
112
} else {
101
- if (
102
- subSegments . length > 0 &&
103
- subSegments [ subSegments . length - 1 ] [ 0 ]
104
- ) {
105
- subSegments [ subSegments . length - 1 ] [ 1 ] += text ;
106
- } else if (
107
- text . trim ( ) !== "" ||
108
- text . trim ( ) === "," ||
109
- text . trim ( ) === "."
110
- ) {
111
- subSegments . push ( [ false , text ] ) ;
112
- } else {
113
- subSegments . push ( [ true , text ] ) ;
114
- }
113
+ subSegments . push ( [ true , text ] ) ;
115
114
}
116
115
} ) ;
117
116
@@ -125,41 +124,35 @@ async function recursivelyTranslate(
125
124
if ( tagName === "WRAPPER" ) {
126
125
return ;
127
126
}
128
-
129
- if ( subIsRecording ) {
130
- subCurrentSegment += `</${ tagName } >` ;
131
- }
127
+
128
+ subCurrentSegment += `</${ tagName } >` ;
132
129
133
130
if ( subCurrentDepth === 2 ) {
134
131
// We are closing a segment element.
135
132
if (
136
- tagName === "LATEXINLINE" ||
137
- tagName === "LATEX" ||
138
- tagName === "SNIPPET" ||
139
- tagName === "SCHEMEINLINE"
133
+ ignoredTags . includes ( tagName )
140
134
) {
141
135
subSegments . push ( [ false , subCurrentSegment ] ) ;
136
+ } else if (
137
+ subSegments . length > 0 &&
138
+ subSegments [ subSegments . length - 1 ] [ 0 ] &&
139
+ subSegments [ subSegments . length - 1 ] [ 1 ] . length +
140
+ subCurrentSegment . length <
141
+ MAXLEN
142
+ ) {
143
+ subSegments [ subSegments . length - 1 ] [ 1 ] += subCurrentSegment ;
142
144
} else {
143
- if (
144
- subSegments . length > 0 &&
145
- subSegments [ subSegments . length - 1 ] [ 0 ] &&
146
- ( subSegments [ subSegments . length - 1 ] [ 1 ] . length +
147
- subCurrentSegment . length ) <
148
- MAXLEN
149
- ) {
150
- subSegments [ subSegments . length - 1 ] [ 1 ] += subCurrentSegment ;
151
- } else {
152
145
subSegments . push ( [ true , subCurrentSegment ] ) ;
153
- }
154
146
}
155
147
subCurrentSegment = "" ;
156
148
subIsRecording = false ;
157
149
}
158
-
150
+
159
151
if ( subCurrentDepth === 1 ) {
160
- // We are closing the root element.
161
- subSegments . push ( [ false , `</ ${ tagName } >` ] ) ;
152
+ subSegments . push ( [ false , `</ ${ tagName } >` ] )
153
+ subCurrentSegment = "" ;
162
154
}
155
+
163
156
subCurrentDepth -- ;
164
157
} ) ;
165
158
@@ -174,7 +167,7 @@ async function recursivelyTranslate(
174
167
subParser . on ( "end" , async ( ) => {
175
168
for ( const segment of subSegments ) {
176
169
if ( segment [ 0 ] ) {
177
- subTranslated . push ( await helper ( segment [ 1 ] , false ) ) ;
170
+ subTranslated . push ( await helper ( segment [ 1 ] ) ) ;
178
171
} else {
179
172
subTranslated . push ( segment [ 1 ] ) ;
180
173
}
@@ -248,28 +241,23 @@ async function recursivelyTranslate(
248
241
}
249
242
250
243
if ( currentDepth === 2 ) {
244
+ isRecording = false ;
251
245
// We are closing a segment element.
252
- if (
253
- tagName === "LATEXINLINE" ||
254
- tagName === "LATEX" ||
255
- tagName === "SNIPPET" ||
256
- tagName === "SCHEMEINLINE" ||
257
- tagName === "SCHEME"
258
- ) {
246
+ if ( ignoredTags . includes ( tagName ) ) {
259
247
segments . push ( [ false , currentSegment ] ) ;
260
248
} else {
261
249
if (
262
250
segments . length > 0 &&
263
251
segments [ segments . length - 1 ] [ 0 ] &&
264
- ( segments [ segments . length - 1 ] [ 1 ] . length +
265
- currentSegment . length ) <
252
+ segments [ segments . length - 1 ] [ 1 ] . length + currentSegment . length <
266
253
MAXLEN
267
254
) {
268
255
segments [ segments . length - 1 ] [ 1 ] += currentSegment ;
269
256
} else {
270
- segments . push ( [ true , currentSegment ] ) ;
257
+ segments . push ( [ true , currentSegment ] ) ;
271
258
}
272
259
}
260
+ currentSegment = "" ;
273
261
}
274
262
275
263
if ( currentDepth === 1 ) {
@@ -291,7 +279,7 @@ async function recursivelyTranslate(
291
279
parser . on ( "end" , async ( ) => {
292
280
for ( const segment of segments ) {
293
281
if ( segment [ 0 ] ) {
294
- translated . push ( await helper ( segment [ 1 ] , false ) ) ;
282
+ translated . push ( await helper ( segment [ 1 ] ) ) ;
295
283
} else {
296
284
translated . push ( segment [ 1 ] ) ;
297
285
}
@@ -314,12 +302,12 @@ async function recursivelyTranslate(
314
302
if ( chunk . trim ( ) === "" || chunk . trim ( ) === "," || chunk . trim ( ) === "." ) {
315
303
return chunk ;
316
304
}
317
-
305
+
318
306
// console.log("Translating chunk of length: " + chunk.length);
319
- if ( chunk . length < 100 ) {
320
- console . log ( "\nchunk: " + chunk )
321
- }
322
-
307
+ // if (chunk.length < 100) {
308
+ // console.log("\nchunk: " + chunk);
309
+ // }
310
+
323
311
let translatedChunk = "" ;
324
312
325
313
try {
@@ -332,7 +320,7 @@ async function recursivelyTranslate(
332
320
Content to translate:
333
321
<TRANSLATE> ${ chunk } </TRANSLATE>`
334
322
} ) ;
335
-
323
+
336
324
const run = await ai . beta . threads . runs . createAndPoll ( thread . id , {
337
325
assistant_id : assistant_id
338
326
} ) ;
@@ -353,7 +341,7 @@ async function recursivelyTranslate(
353
341
const text = messageContent . text ;
354
342
355
343
const safeText = escapeXML ( text . value ) ;
356
- console . log ( safeText ) ;
344
+ // const safeText = chunk ;
357
345
const textStream = Readable . from ( "<WRAPPER>" + safeText + "</WRAPPER>" ) ;
358
346
359
347
await new Promise < void > ( ( resolve , reject ) => {
@@ -394,13 +382,18 @@ async function recursivelyTranslate(
394
382
clean . on ( "error" , error => {
395
383
console . log (
396
384
"error encountered when validating XML: " +
397
- error + "\nfile: " + path +
385
+ error +
386
+ "\nfile: " +
387
+ path +
398
388
"\n section: " +
399
- ( safeText . length > 50 ? safeText . substring ( 0 , 100 ) + "..." : safeText )
389
+ safeText +
390
+ "\n original text: " +
391
+ chunk
400
392
) ;
401
393
402
394
// Attempt to recover using the internal parser
403
395
try {
396
+ clean . _parser . error = null ;
404
397
clean . _parser . resume ( ) ;
405
398
} catch ( e ) {
406
399
console . log ( "Failed to resume parser:" , e ) ;
0 commit comments