Skip to content

Commit dc6d4a1

Browse files
authored
Updated htmlparser2 to the latest version (#1692)
* Updated htmlparser2 to the latest version
* Move null xml condition to the top + remove _parser.reset()
* Use WritableStream to parse Buffer xml
1 parent fad2ce0 commit dc6d4a1

File tree

6 files changed

+234
-229
lines changed

6 files changed

+234
-229
lines changed

lib/data/briefcase.js

Lines changed: 73 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
const { Transform } = require('stream');
1111
const hparser = require('htmlparser2');
12+
const { WritableStream } = require('htmlparser2/WritableStream');
1213
const { identity, last } = require('ramda');
1314
const csv = require('csv-stringify');
1415
const sanitize = require('sanitize-filename');
@@ -93,80 +94,87 @@ const processRow = (xml, instanceId, fields, header, selectValues) => new Promis
9394
const dataStack = [ generateDataFrame(header) ];
9495

9596
// now spin up our XML parser and let its SAX-like tree events drive our traversal.
96-
const parser = new hparser.Parser({
97-
onopentag: (name) => {
98-
const field = schemaStack.push(name);
99-
if (field == null) {
100-
// if we don't have a schema definition for this field, navigate into nothing.
101-
dataStack.push(null);
102-
} else if (field.type === 'repeat') {
103-
// we are going to be writing to a new subrow:
104-
const subrow = generateDataFrame(field.header);
105-
subrow[field.meta.key] = keyForStacks(instanceId, schemaStack);
106-
subrow[field.meta.parentKey] =
107-
keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer());
108-
dataStack.push(subrow);
109-
} else {
110-
// for structures and primitive fields, we don't change any pointers
111-
// besides our field, which was already done above.
112-
pushPtr(dataStack);
113-
}
114-
},
115-
ontext: (text) => {
116-
const field = schemaStack.head();
117-
if (field?.idx != null) {
118-
// we have a real schema field for this text value and a place to put the
119-
// value, so inject it into the appropriate spot in the row.
120-
121-
const dataPtr = ptr(dataStack);
122-
if (field.type === 'geopoint') {
123-
const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g);
124-
dataPtr[field.idx] = lat;
125-
dataPtr[field.idx + 1] = lon;
126-
dataPtr[field.idx + 2] = altitude;
127-
dataPtr[field.idx + 3] = accuracy;
97+
const createParser = (ParserClass) => {
98+
const parser = new ParserClass({
99+
onopentag: (name) => {
100+
const field = schemaStack.push(name);
101+
if (field == null) {
102+
// if we don't have a schema definition for this field, navigate into nothing.
103+
dataStack.push(null);
104+
} else if (field.type === 'repeat') {
105+
// we are going to be writing to a new subrow:
106+
const subrow = generateDataFrame(field.header);
107+
subrow[field.meta.key] = keyForStacks(instanceId, schemaStack);
108+
subrow[field.meta.parentKey] =
109+
keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer());
110+
dataStack.push(subrow);
128111
} else {
129-
// we have to account for multiple text events for the same field,
130-
// since for whatever reason entities decode into their own text events.
131-
dataPtr[field.idx] = (dataPtr[field.idx] || '') + text;
132-
133-
if (field.selectMultiple === true) {
134-
// if we are a select multiple and we know about columns for it then we
135-
// need to split and count. TODO: we don't do anything clever to keep from
136-
// recounting the field from scratch on html entity; it's hopefully rare?
137-
const known = selectValues?.[field.path];
138-
if (known != null) {
139-
for (const value of dataPtr[field.idx].split(/\s+/g)) {
140-
const idx = known.indexOf(value);
141-
if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1;
112+
// for structures and primitive fields, we don't change any pointers
113+
// besides our field, which was already done above.
114+
pushPtr(dataStack);
115+
}
116+
},
117+
ontext: (text) => {
118+
const field = schemaStack.head();
119+
if (field?.idx != null) {
120+
// we have a real schema field for this text value and a place to put the
121+
// value, so inject it into the appropriate spot in the row.
122+
123+
const dataPtr = ptr(dataStack);
124+
if (field.type === 'geopoint') {
125+
const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g);
126+
dataPtr[field.idx] = lat;
127+
dataPtr[field.idx + 1] = lon;
128+
dataPtr[field.idx + 2] = altitude;
129+
dataPtr[field.idx + 3] = accuracy;
130+
} else {
131+
// we have to account for multiple text events for the same field,
132+
// since for whatever reason entities decode into their own text events.
133+
dataPtr[field.idx] = (dataPtr[field.idx] || '') + text;
134+
135+
if (field.selectMultiple === true) {
136+
// if we are a select multiple and we know about columns for it then we
137+
// need to split and count. TODO: we don't do anything clever to keep from
138+
// recounting the field from scratch on html entity; it's hopefully rare?
139+
const known = selectValues?.[field.path];
140+
if (known != null) {
141+
for (const value of dataPtr[field.idx].split(/\s+/g)) {
142+
const idx = known.indexOf(value);
143+
if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1;
144+
}
142145
}
143146
}
144147
}
145148
}
149+
},
150+
onclosetag: () => {
151+
// shed a context layer by popping all our state machine stacks.
152+
const field = schemaStack.pop();
153+
const row = dataStack.pop();
154+
155+
// if we popped a repeat, we need to write the subrow we just created.
156+
if (field?.type === 'repeat')
157+
field.stream.write(row);
158+
159+
// if we popped everything, we've hit the close tag. write out a few special
160+
// values, and send the row off to be written by our caller, as it is a different
161+
// stream type/mechanism and we don't have a reference to it..
162+
if (schemaStack.hasExited()) {
163+
if (!(parser instanceof WritableStream)) {
164+
parser.reset();
165+
}
166+
resolve(row);
167+
}
146168
}
147-
},
148-
onclosetag: () => {
149-
// shed a context layer by popping all our state machine stacks.
150-
const field = schemaStack.pop();
151-
const row = dataStack.pop();
152-
153-
// if we popped a repeat, we need to write the subrow we just created.
154-
if (field?.type === 'repeat')
155-
field.stream.write(row);
156-
157-
// if we popped everything, we've hit the close tag. write out a few special
158-
// values, and send the row off to be written by our caller, as it is a different
159-
// stream type/mechanism and we don't have a reference to it..
160-
if (schemaStack.hasExited()) {
161-
parser.reset();
162-
resolve(row);
163-
}
164-
}
165-
}, { xmlMode: true, decodeEntities: true });
169+
}, { xmlMode: true, decodeEntities: true });
170+
171+
return parser;
172+
};
166173

167174
if (xml instanceof PartialPipe) {
168-
xml.with(parser).pipeline(rejectIfError(reject));
175+
xml.with(createParser(WritableStream)).pipeline(rejectIfError(reject));
169176
} else {
177+
const parser = createParser(hparser.Parser);
170178
parser.write(xml);
171179
parser.end();
172180
}

lib/data/schema.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ const _versionSplicer = (replace) => (xml, insert) => new Promise((pass, fail) =
549549
// out where the attribute actually is. the parser startIndex and endIndex point
550550
// at the whitespace preceding the tag until the tag is closed. obviously this is
551551
// pretty bad but i don't see a more robust solution right now.
552-
const idx = parser._tokenizer._index;
552+
const idx = parser.tokenizer.index;
553553
parser.reset();
554554
return replace
555555
? pass(`${xml.slice(0, idx - value.length)}${insert}${xml.slice(idx)}`)
@@ -623,7 +623,7 @@ const _updateEntityVersion = (xml, oldVersion, newVersion) => new Promise((pass,
623623
onattribute: (name, value) => {
624624
if ((stripNamespacesFromPath(name) === 'entities-version') && (value === oldVersion)
625625
&& (stack.length) === 2 && (stack[0] === 'html') && (stack[1] === 'head')) {
626-
const idx = parser._tokenizer._index;
626+
const idx = parser.tokenizer.index;
627627
parser.reset();
628628
return pass(`${xml.slice(0, idx - value.length)}${newVersion}${xml.slice(idx)}`);
629629
}

lib/data/submission.js

Lines changed: 49 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
const { Readable } = require('stream');
1111
const { createHash } = require('crypto');
1212
const hparser = require('htmlparser2');
13+
const { WritableStream } = require('htmlparser2/WritableStream');
1314
const fmdiff = require('fast-myers-diff');
1415
const { SchemaStack } = require('./schema');
1516
const { noop } = require('../util/util');
@@ -32,47 +33,59 @@ const { union, last, pluck } = require('ramda');
3233
// to false for whatever you are doing.
3334
const submissionXmlToFieldStream = (fields, xml, includeStructuralAttrs = false, includeEmptyNodes = false) => {
3435
const outStream = new Readable({ objectMode: true, read: noop });
36+
if (!xml) {
37+
outStream.destroy(new Error('Stream ended before stack was exhausted.'));
38+
return outStream;
39+
}
3540

3641
const stack = new SchemaStack(fields, true);
3742
let textBuffer = ''; // agglomerates text nodes that come as multiple events.
38-
const parser = new hparser.Parser({
39-
onend: () => {
40-
if (!stack.hasExited()) {
41-
outStream.destroy(new Error('Stream ended before stack was exhausted.'));
42-
}
43-
},
44-
onopentag: (name, attrs) => {
45-
const field = stack.push(name);
46-
if (field != null) {
47-
textBuffer = '';
48-
// If the field is a structural field AND it has attributes AND we should output them, THEN do so.
49-
if (includeStructuralAttrs &&
50-
(typeof field.isStructural === 'function' && field.isStructural()) &&
51-
Object.keys(attrs).length !== 0)
52-
outStream.push({ field: { ...field, attrs }, text: null });
53-
}
54-
},
55-
ontext: (text) => {
56-
textBuffer += text;
57-
},
58-
onclosetag: () => {
59-
const field = stack.pop();
60-
61-
if (textBuffer !== '' || includeEmptyNodes) {
62-
if ((field != null) && !field.isStructural()) // don't output useless whitespace
63-
outStream.push({ field, text: textBuffer });
64-
textBuffer = '';
65-
}
43+
const createParser = (ParserClass) => {
44+
const parser = new ParserClass({
45+
onend: () => {
46+
if (!stack.hasExited()) {
47+
outStream.destroy(new Error('Stream ended before stack was exhausted.'));
48+
}
49+
},
50+
onopentag: (name, attrs) => {
51+
const field = stack.push(name);
52+
if (field != null) {
53+
textBuffer = '';
54+
// If the field is a structural field AND it has attributes AND we should output them, THEN do so.
55+
if (includeStructuralAttrs &&
56+
(typeof field.isStructural === 'function' && field.isStructural()) &&
57+
Object.keys(attrs).length !== 0)
58+
outStream.push({ field: { ...field, attrs }, text: null });
59+
}
60+
},
61+
ontext: (text) => {
62+
textBuffer += text;
63+
},
64+
onclosetag: () => {
65+
const field = stack.pop();
66+
67+
if (textBuffer !== '' || includeEmptyNodes) {
68+
if ((field != null) && !field.isStructural()) // don't output useless whitespace
69+
outStream.push({ field, text: textBuffer });
70+
textBuffer = '';
71+
}
6672

67-
if (stack.hasExited()) {
68-
parser.reset();
69-
outStream.push(null);
73+
if (stack.hasExited()) {
74+
if (!(parser instanceof WritableStream)) parser.reset();
75+
outStream.push(null);
76+
}
7077
}
71-
}
72-
}, { xmlMode: true, decodeEntities: true });
73-
74-
parser.write(xml);
75-
parser.end();
78+
}, { xmlMode: true, decodeEntities: true });
79+
return parser;
80+
};
81+
82+
if (xml instanceof Buffer) {
83+
Readable.from(xml).pipe(createParser(WritableStream));
84+
} else {
85+
const parser = createParser(hparser.Parser);
86+
parser.write(xml);
87+
parser.end();
88+
}
7689

7790
return outStream;
7891
};

lib/util/xml.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ const traverseXml = (input, initTraversers) => new Promise((resolve, reject) =>
152152
// however we call end, we want to resolve with the results we did have.
153153
// we crush the nothing sentinel value down to Option.none at this point.
154154
onend: () => resolve(results.map((x) => ((x === nothing) ? Option.none() : Option.of(x))))
155-
}, { xmlMode: true });
155+
}, { xmlMode: true, decodeEntities: false });
156156

157157
// actually feed our input into our xml parser.
158158
if (typeof input.pipe === 'function') {
@@ -167,7 +167,11 @@ const traverseXml = (input, initTraversers) => new Promise((resolve, reject) =>
167167
// otherwise we have a string or a buffer (or something crazy that will
168168
// probably crash).
169169
try {
170-
parser.write(input);
170+
if (input instanceof Buffer) {
171+
parser.write(input.toString('utf8'));
172+
} else {
173+
parser.write(input);
174+
}
171175
parser.end();
172176
} catch (ex) { reject(ex); }
173177
}

0 commit comments

Comments
 (0)