Updated htmlparser2 to the latest version (#1692)

sadiqkhoja · web-flow · commit dc6d4a1167ed · 2025-12-04T09:14:57.000-05:00
* Updated htmlparser2 to the latest version

* Move null xml condition to the top

Also remove _parser.reset()

* Use WritableStream to parse Buffer xml
diff --git a/lib/data/briefcase.js b/lib/data/briefcase.js
@@ -9,6 +9,7 @@
 
 const { Transform } = require('stream');
 const hparser = require('htmlparser2');
+const { WritableStream } = require('htmlparser2/WritableStream');
 const { identity, last } = require('ramda');
 const csv = require('csv-stringify');
 const sanitize = require('sanitize-filename');
@@ -93,80 +94,87 @@ const processRow = (xml, instanceId, fields, header, selectValues) => new Promis
   const dataStack = [ generateDataFrame(header) ];
 
   // now spin up our XML parser and let its SAX-like tree events drive our traversal.
-  const parser = new hparser.Parser({
-    onopentag: (name) => {
-      const field = schemaStack.push(name);
-      if (field == null) {
-        // if we don't have a schema definition for this field, navigate into nothing.
-        dataStack.push(null);
-      } else if (field.type === 'repeat') {
-        // we are going to be writing to a new subrow:
-        const subrow = generateDataFrame(field.header);
-        subrow[field.meta.key] = keyForStacks(instanceId, schemaStack);
-        subrow[field.meta.parentKey] =
-          keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer());
-        dataStack.push(subrow);
-      } else {
-        // for structures and primitive fields, we don't change any pointers
-        // besides our field, which was already done above.
-        pushPtr(dataStack);
-      }
-    },
-    ontext: (text) => {
-      const field = schemaStack.head();
-      if (field?.idx != null) {
-        // we have a real schema field for this text value and a place to put the
-        // value, so inject it into the appropriate spot in the row.
-
-        const dataPtr = ptr(dataStack);
-        if (field.type === 'geopoint') {
-          const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g);
-          dataPtr[field.idx] = lat;
-          dataPtr[field.idx + 1] = lon;
-          dataPtr[field.idx + 2] = altitude;
-          dataPtr[field.idx + 3] = accuracy;
+  const createParser = (ParserClass) => {
+    const parser = new ParserClass({
+      onopentag: (name) => {
+        const field = schemaStack.push(name);
+        if (field == null) {
+          // if we don't have a schema definition for this field, navigate into nothing.
+          dataStack.push(null);
+        } else if (field.type === 'repeat') {
+          // we are going to be writing to a new subrow:
+          const subrow = generateDataFrame(field.header);
+          subrow[field.meta.key] = keyForStacks(instanceId, schemaStack);
+          subrow[field.meta.parentKey] =
+            keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer());
+          dataStack.push(subrow);
         } else {
-          // we have to account for multiple text events for the same field,
-          // since for whatever reason entities decode into their own text events.
-          dataPtr[field.idx] = (dataPtr[field.idx] || '') + text;
-
-          if (field.selectMultiple === true) {
-            // if we are a select multiple and we know about columns for it then we
-            // need to split and count. TODO: we don't do anything clever to keep from
-            // recounting the field from scratch on html entity; it's hopefully rare?
-            const known = selectValues?.[field.path];
-            if (known != null) {
-              for (const value of dataPtr[field.idx].split(/\s+/g)) {
-                const idx = known.indexOf(value);
-                if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1;
+          // for structures and primitive fields, we don't change any pointers
+          // besides our field, which was already done above.
+          pushPtr(dataStack);
+        }
+      },
+      ontext: (text) => {
+        const field = schemaStack.head();
+        if (field?.idx != null) {
+          // we have a real schema field for this text value and a place to put the
+          // value, so inject it into the appropriate spot in the row.
+
+          const dataPtr = ptr(dataStack);
+          if (field.type === 'geopoint') {
+            const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g);
+            dataPtr[field.idx] = lat;
+            dataPtr[field.idx + 1] = lon;
+            dataPtr[field.idx + 2] = altitude;
+            dataPtr[field.idx + 3] = accuracy;
+          } else {
+            // we have to account for multiple text events for the same field,
+            // since for whatever reason entities decode into their own text events.
+            dataPtr[field.idx] = (dataPtr[field.idx] || '') + text;
+
+            if (field.selectMultiple === true) {
+              // if we are a select multiple and we know about columns for it then we
+              // need to split and count. TODO: we don't do anything clever to keep from
+              // recounting the field from scratch on html entity; it's hopefully rare?
+              const known = selectValues?.[field.path];
+              if (known != null) {
+                for (const value of dataPtr[field.idx].split(/\s+/g)) {
+                  const idx = known.indexOf(value);
+                  if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1;
+                }
               }
             }
           }
         }
+      },
+      onclosetag: () => {
+        // shed a context layer by popping all our state machine stacks.
+        const field = schemaStack.pop();
+        const row = dataStack.pop();
+
+        // if we popped a repeat, we need to write the subrow we just created.
+        if (field?.type === 'repeat')
+          field.stream.write(row);
+
+        // if we popped everything, we've hit the close tag. write out a few special
+        // values, and send the row off to be written by our caller, as it is a different
+        // stream type/mechanism and we don't have a reference to it..
+        if (schemaStack.hasExited()) {
+          if (!(parser instanceof WritableStream)) {
+            parser.reset();
+          }
+          resolve(row);
+        }
       }
-    },
-    onclosetag: () => {
-      // shed a context layer by popping all our state machine stacks.
-      const field = schemaStack.pop();
-      const row = dataStack.pop();
-
-      // if we popped a repeat, we need to write the subrow we just created.
-      if (field?.type === 'repeat')
-        field.stream.write(row);
-
-      // if we popped everything, we've hit the close tag. write out a few special
-      // values, and send the row off to be written by our caller, as it is a different
-      // stream type/mechanism and we don't have a reference to it..
-      if (schemaStack.hasExited()) {
-        parser.reset();
-        resolve(row);
-      }
-    }
-  }, { xmlMode: true, decodeEntities: true });
+    }, { xmlMode: true, decodeEntities: true });
+
+    return parser;
+  };
 
   if (xml instanceof PartialPipe) {
-    xml.with(parser).pipeline(rejectIfError(reject));
+    xml.with(createParser(WritableStream)).pipeline(rejectIfError(reject));
   } else {
+    const parser = createParser(hparser.Parser);
     parser.write(xml);
     parser.end();
   }
diff --git a/lib/data/schema.js b/lib/data/schema.js
@@ -549,7 +549,7 @@ const _versionSplicer = (replace) => (xml, insert) => new Promise((pass, fail) =
         // out where the attribute actually is. the parser startIndex and endIndex point
         // at the whitespace preceding the tag until the tag is closed. obviously this is
         // pretty bad but i don't see a more robust solution right now.
-        const idx = parser._tokenizer._index;
+        const idx = parser.tokenizer.index;
         parser.reset();
         return replace
           ? pass(`${xml.slice(0, idx - value.length)}${insert}${xml.slice(idx)}`)
@@ -623,7 +623,7 @@ const _updateEntityVersion = (xml, oldVersion, newVersion) => new Promise((pass,
     onattribute: (name, value) => {
       if ((stripNamespacesFromPath(name) === 'entities-version') && (value === oldVersion)
         && (stack.length) === 2 && (stack[0] === 'html') && (stack[1] === 'head')) {
-        const idx = parser._tokenizer._index;
+        const idx = parser.tokenizer.index;
         parser.reset();
         return pass(`${xml.slice(0, idx - value.length)}${newVersion}${xml.slice(idx)}`);
       }
diff --git a/lib/data/submission.js b/lib/data/submission.js
@@ -10,6 +10,7 @@
 const { Readable } = require('stream');
 const { createHash } = require('crypto');
 const hparser = require('htmlparser2');
+const { WritableStream } = require('htmlparser2/WritableStream');
 const fmdiff = require('fast-myers-diff');
 const { SchemaStack } = require('./schema');
 const { noop } = require('../util/util');
@@ -32,47 +33,59 @@ const { union, last, pluck } = require('ramda');
 // to false for whatever you are doing.
 const submissionXmlToFieldStream = (fields, xml, includeStructuralAttrs = false, includeEmptyNodes = false) => {
   const outStream = new Readable({ objectMode: true, read: noop });
+  if (!xml) {
+    outStream.destroy(new Error('Stream ended before stack was exhausted.'));
+    return outStream;
+  }
 
   const stack = new SchemaStack(fields, true);
   let textBuffer = ''; // agglomerates text nodes that come as multiple events.
-  const parser = new hparser.Parser({
-    onend: () => {
-      if (!stack.hasExited()) {
-        outStream.destroy(new Error('Stream ended before stack was exhausted.'));
-      }
-    },
-    onopentag: (name, attrs) => {
-      const field = stack.push(name);
-      if (field != null) {
-        textBuffer = '';
-        // If the field is a structural field AND it has attributes AND we should output them, THEN do so.
-        if (includeStructuralAttrs &&
-          (typeof field.isStructural === 'function' && field.isStructural()) &&
-          Object.keys(attrs).length !== 0)
-          outStream.push({ field: { ...field, attrs }, text: null });
-      }
-    },
-    ontext: (text) => {
-      textBuffer += text;
-    },
-    onclosetag: () => {
-      const field = stack.pop();
-
-      if (textBuffer !== '' || includeEmptyNodes) {
-        if ((field != null) && !field.isStructural()) // don't output useless whitespace
-          outStream.push({ field, text: textBuffer });
-        textBuffer = '';
-      }
+  const createParser = (ParserClass) => {
+    const parser = new ParserClass({
+      onend: () => {
+        if (!stack.hasExited()) {
+          outStream.destroy(new Error('Stream ended before stack was exhausted.'));
+        }
+      },
+      onopentag: (name, attrs) => {
+        const field = stack.push(name);
+        if (field != null) {
+          textBuffer = '';
+          // If the field is a structural field AND it has attributes AND we should output them, THEN do so.
+          if (includeStructuralAttrs &&
+            (typeof field.isStructural === 'function' && field.isStructural()) &&
+            Object.keys(attrs).length !== 0)
+            outStream.push({ field: { ...field, attrs }, text: null });
+        }
+      },
+      ontext: (text) => {
+        textBuffer += text;
+      },
+      onclosetag: () => {
+        const field = stack.pop();
+
+        if (textBuffer !== '' || includeEmptyNodes) {
+          if ((field != null) && !field.isStructural()) // don't output useless whitespace
+            outStream.push({ field, text: textBuffer });
+          textBuffer = '';
+        }
 
-      if (stack.hasExited()) {
-        parser.reset();
-        outStream.push(null);
+        if (stack.hasExited()) {
+          if (!(parser instanceof WritableStream)) parser.reset();
+          outStream.push(null);
+        }
       }
-    }
-  }, { xmlMode: true, decodeEntities: true });
-
-  parser.write(xml);
-  parser.end();
+    }, { xmlMode: true, decodeEntities: true });
+    return parser;
+  };
+
+  if (xml instanceof Buffer) {
+    Readable.from(xml).pipe(createParser(WritableStream));
+  } else {
+    const parser = createParser(hparser.Parser);
+    parser.write(xml);
+    parser.end();
+  }
 
   return outStream;
 };
diff --git a/lib/util/xml.js b/lib/util/xml.js
@@ -152,7 +152,7 @@ const traverseXml = (input, initTraversers) => new Promise((resolve, reject) =>
     // however we call end, we want to resolve with the results we did have.
     // we crush the nothing sentinel value down to Option.none at this point.
     onend: () => resolve(results.map((x) => ((x === nothing) ? Option.none() : Option.of(x))))
-  }, { xmlMode: true });
+  }, { xmlMode: true, decodeEntities: false });
 
   // actually feed our input into our xml parser.
   if (typeof input.pipe === 'function') {
@@ -167,7 +167,11 @@ const traverseXml = (input, initTraversers) => new Promise((resolve, reject) =>
     // otherwise we have a string or a buffer (or something crazy that will
     // probably crash).
     try {
-      parser.write(input);
+      if (input instanceof Buffer) {
+        parser.write(input.toString('utf8'));
+      } else {
+        parser.write(input);
+      }
       parser.end();
     } catch (ex) { reject(ex); }
   }
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json