Skip to content

Commit 32deca2

Browse files
committed
Updated htmlparser2 to the latest version
1 parent 373fd05 commit 32deca2

File tree

6 files changed

+197
-195
lines changed

6 files changed

+197
-195
lines changed

lib/data/briefcase.js

Lines changed: 75 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
const { Transform } = require('stream');
1111
const hparser = require('htmlparser2');
12+
const { WritableStream } = require('htmlparser2/WritableStream');
1213
const { identity, last } = require('ramda');
1314
const csv = require('csv-stringify');
1415
const sanitize = require('sanitize-filename');
@@ -93,80 +94,89 @@ const processRow = (xml, instanceId, fields, header, selectValues) => new Promis
9394
const dataStack = [ generateDataFrame(header) ];
9495

9596
// now spin up our XML parser and let its SAX-like tree events drive our traversal.
96-
const parser = new hparser.Parser({
97-
onopentag: (name) => {
98-
const field = schemaStack.push(name);
99-
if (field == null) {
100-
// if we don't have a schema definition for this field, navigate into nothing.
101-
dataStack.push(null);
102-
} else if (field.type === 'repeat') {
103-
// we are going to be writing to a new subrow:
104-
const subrow = generateDataFrame(field.header);
105-
subrow[field.meta.key] = keyForStacks(instanceId, schemaStack);
106-
subrow[field.meta.parentKey] =
107-
keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer());
108-
dataStack.push(subrow);
109-
} else {
110-
// for structures and primitive fields, we don't change any pointers
111-
// besides our field, which was already done above.
112-
pushPtr(dataStack);
113-
}
114-
},
115-
ontext: (text) => {
116-
const field = schemaStack.head();
117-
if (field?.idx != null) {
118-
// we have a real schema field for this text value and a place to put the
119-
// value, so inject it into the appropriate spot in the row.
120-
121-
const dataPtr = ptr(dataStack);
122-
if (field.type === 'geopoint') {
123-
const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g);
124-
dataPtr[field.idx] = lat;
125-
dataPtr[field.idx + 1] = lon;
126-
dataPtr[field.idx + 2] = altitude;
127-
dataPtr[field.idx + 3] = accuracy;
97+
const createParser = (ParserClass) => {
98+
const parser = new ParserClass({
99+
onopentag: (name) => {
100+
const field = schemaStack.push(name);
101+
if (field == null) {
102+
// if we don't have a schema definition for this field, navigate into nothing.
103+
dataStack.push(null);
104+
} else if (field.type === 'repeat') {
105+
// we are going to be writing to a new subrow:
106+
const subrow = generateDataFrame(field.header);
107+
subrow[field.meta.key] = keyForStacks(instanceId, schemaStack);
108+
subrow[field.meta.parentKey] =
109+
keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer());
110+
dataStack.push(subrow);
128111
} else {
129-
// we have to account for multiple text events for the same field,
130-
// since for whatever reason entities decode into their own text events.
131-
dataPtr[field.idx] = (dataPtr[field.idx] || '') + text;
132-
133-
if (field.selectMultiple === true) {
134-
// if we are a select multiple and we know about columns for it then we
135-
// need to split and count. TODO: we don't do anything clever to keep from
136-
// recounting the field from scratch on html entity; it's hopefully rare?
137-
const known = selectValues?.[field.path];
138-
if (known != null) {
139-
for (const value of dataPtr[field.idx].split(/\s+/g)) {
140-
const idx = known.indexOf(value);
141-
if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1;
112+
// for structures and primitive fields, we don't change any pointers
113+
// besides our field, which was already done above.
114+
pushPtr(dataStack);
115+
}
116+
},
117+
ontext: (text) => {
118+
const field = schemaStack.head();
119+
if (field?.idx != null) {
120+
// we have a real schema field for this text value and a place to put the
121+
// value, so inject it into the appropriate spot in the row.
122+
123+
const dataPtr = ptr(dataStack);
124+
if (field.type === 'geopoint') {
125+
const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g);
126+
dataPtr[field.idx] = lat;
127+
dataPtr[field.idx + 1] = lon;
128+
dataPtr[field.idx + 2] = altitude;
129+
dataPtr[field.idx + 3] = accuracy;
130+
} else {
131+
// we have to account for multiple text events for the same field,
132+
// since for whatever reason entities decode into their own text events.
133+
dataPtr[field.idx] = (dataPtr[field.idx] || '') + text;
134+
135+
if (field.selectMultiple === true) {
136+
// if we are a select multiple and we know about columns for it then we
137+
// need to split and count. TODO: we don't do anything clever to keep from
138+
// recounting the field from scratch on html entity; it's hopefully rare?
139+
const known = selectValues?.[field.path];
140+
if (known != null) {
141+
for (const value of dataPtr[field.idx].split(/\s+/g)) {
142+
const idx = known.indexOf(value);
143+
if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1;
144+
}
142145
}
143146
}
144147
}
145148
}
149+
},
150+
onclosetag: () => {
151+
// shed a context layer by popping all our state machine stacks.
152+
const field = schemaStack.pop();
153+
const row = dataStack.pop();
154+
155+
// if we popped a repeat, we need to write the subrow we just created.
156+
if (field?.type === 'repeat')
157+
field.stream.write(row);
158+
159+
// if we popped everything, we've hit the close tag. write out a few special
160+
// values, and send the row off to be written by our caller, as it is a different
161+
stream type/mechanism and we don't have a reference to it.
162+
if (schemaStack.hasExited()) {
163+
if (parser instanceof WritableStream) {
164+
parser._parser.reset();
165+
} else {
166+
parser.reset();
167+
}
168+
resolve(row);
169+
}
146170
}
147-
},
148-
onclosetag: () => {
149-
// shed a context layer by popping all our state machine stacks.
150-
const field = schemaStack.pop();
151-
const row = dataStack.pop();
152-
153-
// if we popped a repeat, we need to write the subrow we just created.
154-
if (field?.type === 'repeat')
155-
field.stream.write(row);
156-
157-
// if we popped everything, we've hit the close tag. write out a few special
158-
// values, and send the row off to be written by our caller, as it is a different
159-
stream type/mechanism and we don't have a reference to it.
160-
if (schemaStack.hasExited()) {
161-
parser.reset();
162-
resolve(row);
163-
}
164-
}
165-
}, { xmlMode: true, decodeEntities: true });
171+
}, { xmlMode: true, decodeEntities: true });
172+
173+
return parser;
174+
};
166175

167176
if (xml instanceof PartialPipe) {
168-
xml.with(parser).pipeline(rejectIfError(reject));
177+
xml.with(createParser(WritableStream)).pipeline(rejectIfError(reject));
169178
} else {
179+
const parser = createParser(hparser.Parser);
170180
parser.write(xml);
171181
parser.end();
172182
}

lib/data/schema.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ const _versionSplicer = (replace) => (xml, insert) => new Promise((pass, fail) =
549549
// out where the attribute actually is. the parser startIndex and endIndex point
550550
// at the whitespace preceding the tag until the tag is closed. obviously this is
551551
// pretty bad but i don't see a more robust solution right now.
552-
const idx = parser._tokenizer._index;
552+
const idx = parser.tokenizer.index;
553553
parser.reset();
554554
return replace
555555
? pass(`${xml.slice(0, idx - value.length)}${insert}${xml.slice(idx)}`)
@@ -623,7 +623,7 @@ const _updateEntityVersion = (xml, oldVersion, newVersion) => new Promise((pass,
623623
onattribute: (name, value) => {
624624
if ((stripNamespacesFromPath(name) === 'entities-version') && (value === oldVersion)
625625
&& (stack.length) === 2 && (stack[0] === 'html') && (stack[1] === 'head')) {
626-
const idx = parser._tokenizer._index;
626+
const idx = parser.tokenizer.index;
627627
parser.reset();
628628
return pass(`${xml.slice(0, idx - value.length)}${newVersion}${xml.slice(idx)}`);
629629
}

lib/data/submission.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,16 @@ const submissionXmlToFieldStream = (fields, xml, includeStructuralAttrs = false,
7171
}
7272
}, { xmlMode: true, decodeEntities: true });
7373

74-
parser.write(xml);
75-
parser.end();
74+
if (!xml) {
75+
outStream.destroy(new Error('Stream ended before stack was exhausted.'));
76+
} else {
77+
if (xml instanceof Buffer) {
78+
parser.write(xml.toString('utf8'));
79+
} else {
80+
parser.write(xml);
81+
}
82+
parser.end();
83+
}
7684

7785
return outStream;
7886
};

lib/util/xml.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ const traverseXml = (input, initTraversers) => new Promise((resolve, reject) =>
152152
// however we call end, we want to resolve with the results we did have.
153153
// we crush the nothing sentinel value down to Option.none at this point.
154154
onend: () => resolve(results.map((x) => ((x === nothing) ? Option.none() : Option.of(x))))
155-
}, { xmlMode: true });
155+
}, { xmlMode: true, decodeEntities: false });
156156

157157
// actually feed our input into our xml parser.
158158
if (typeof input.pipe === 'function') {
@@ -167,7 +167,11 @@ const traverseXml = (input, initTraversers) => new Promise((resolve, reject) =>
167167
// otherwise we have a string or a buffer (or something crazy that will
168168
// probably crash).
169169
try {
170-
parser.write(input);
170+
if (input instanceof Buffer) {
171+
parser.write(input.toString('utf8'));
172+
} else {
173+
parser.write(input);
174+
}
171175
parser.end();
172176
} catch (ex) { reject(ex); }
173177
}

0 commit comments

Comments (0)