|
9 | 9 |
|
10 | 10 | const { Transform } = require('stream'); |
11 | 11 | const hparser = require('htmlparser2'); |
| 12 | +const { WritableStream } = require('htmlparser2/WritableStream'); |
12 | 13 | const { identity, last } = require('ramda'); |
13 | 14 | const csv = require('csv-stringify'); |
14 | 15 | const sanitize = require('sanitize-filename'); |
@@ -93,80 +94,87 @@ const processRow = (xml, instanceId, fields, header, selectValues) => new Promis |
93 | 94 | const dataStack = [ generateDataFrame(header) ]; |
94 | 95 |
|
95 | 96 | // now spin up our XML parser and let its SAX-like tree events drive our traversal. |
96 | | - const parser = new hparser.Parser({ |
97 | | - onopentag: (name) => { |
98 | | - const field = schemaStack.push(name); |
99 | | - if (field == null) { |
100 | | - // if we don't have a schema definition for this field, navigate into nothing. |
101 | | - dataStack.push(null); |
102 | | - } else if (field.type === 'repeat') { |
103 | | - // we are going to be writing to a new subrow: |
104 | | - const subrow = generateDataFrame(field.header); |
105 | | - subrow[field.meta.key] = keyForStacks(instanceId, schemaStack); |
106 | | - subrow[field.meta.parentKey] = |
107 | | - keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer()); |
108 | | - dataStack.push(subrow); |
109 | | - } else { |
110 | | - // for structures and primitive fields, we don't change any pointers |
111 | | - // besides our field, which was already done above. |
112 | | - pushPtr(dataStack); |
113 | | - } |
114 | | - }, |
115 | | - ontext: (text) => { |
116 | | - const field = schemaStack.head(); |
117 | | - if (field?.idx != null) { |
118 | | - // we have a real schema field for this text value and a place to put the |
119 | | - // value, so inject it into the appropriate spot in the row. |
120 | | - |
121 | | - const dataPtr = ptr(dataStack); |
122 | | - if (field.type === 'geopoint') { |
123 | | - const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g); |
124 | | - dataPtr[field.idx] = lat; |
125 | | - dataPtr[field.idx + 1] = lon; |
126 | | - dataPtr[field.idx + 2] = altitude; |
127 | | - dataPtr[field.idx + 3] = accuracy; |
| 97 | + const createParser = (ParserClass) => { |
| 98 | + const parser = new ParserClass({ |
| 99 | + onopentag: (name) => { |
| 100 | + const field = schemaStack.push(name); |
| 101 | + if (field == null) { |
| 102 | + // if we don't have a schema definition for this field, navigate into nothing. |
| 103 | + dataStack.push(null); |
| 104 | + } else if (field.type === 'repeat') { |
| 105 | + // we are going to be writing to a new subrow: |
| 106 | + const subrow = generateDataFrame(field.header); |
| 107 | + subrow[field.meta.key] = keyForStacks(instanceId, schemaStack); |
| 108 | + subrow[field.meta.parentKey] = |
| 109 | + keyForStacks(instanceId, schemaStack, schemaStack.repeatContextSlicer()); |
| 110 | + dataStack.push(subrow); |
128 | 111 | } else { |
129 | | - // we have to account for multiple text events for the same field, |
130 | | - // since for whatever reason entities decode into their own text events. |
131 | | - dataPtr[field.idx] = (dataPtr[field.idx] || '') + text; |
132 | | - |
133 | | - if (field.selectMultiple === true) { |
134 | | - // if we are a select multiple and we know about columns for it then we |
135 | | - // need to split and count. TODO: we don't do anything clever to keep from |
136 | | - // recounting the field from scratch on html entity; it's hopefully rare? |
137 | | - const known = selectValues?.[field.path]; |
138 | | - if (known != null) { |
139 | | - for (const value of dataPtr[field.idx].split(/\s+/g)) { |
140 | | - const idx = known.indexOf(value); |
141 | | - if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1; |
| 112 | + // for structures and primitive fields, we don't change any pointers |
| 113 | + // besides our field, which was already done above. |
| 114 | + pushPtr(dataStack); |
| 115 | + } |
| 116 | + }, |
| 117 | + ontext: (text) => { |
| 118 | + const field = schemaStack.head(); |
| 119 | + if (field?.idx != null) { |
| 120 | + // we have a real schema field for this text value and a place to put the |
| 121 | + // value, so inject it into the appropriate spot in the row. |
| 122 | + |
| 123 | + const dataPtr = ptr(dataStack); |
| 124 | + if (field.type === 'geopoint') { |
| 125 | + const [ lat, lon, altitude, accuracy ] = text.split(/\s+/g); |
| 126 | + dataPtr[field.idx] = lat; |
| 127 | + dataPtr[field.idx + 1] = lon; |
| 128 | + dataPtr[field.idx + 2] = altitude; |
| 129 | + dataPtr[field.idx + 3] = accuracy; |
| 130 | + } else { |
| 131 | + // we have to account for multiple text events for the same field, |
| 132 | + // since for whatever reason entities decode into their own text events. |
| 133 | + dataPtr[field.idx] = (dataPtr[field.idx] || '') + text; |
| 134 | + |
| 135 | + if (field.selectMultiple === true) { |
| 136 | + // if we are a select multiple and we know about columns for it then we |
| 137 | + // need to split and count. TODO: we don't do anything clever to keep from |
| 138 | + // recounting the field from scratch on html entity; it's hopefully rare? |
| 139 | + const known = selectValues?.[field.path]; |
| 140 | + if (known != null) { |
| 141 | + for (const value of dataPtr[field.idx].split(/\s+/g)) { |
| 142 | + const idx = known.indexOf(value); |
| 143 | + if (idx >= 0) dataPtr[field.idx + 1 + idx] = 1; |
| 144 | + } |
142 | 145 | } |
143 | 146 | } |
144 | 147 | } |
145 | 148 | } |
| 149 | + }, |
| 150 | + onclosetag: () => { |
| 151 | + // shed a context layer by popping all our state machine stacks. |
| 152 | + const field = schemaStack.pop(); |
| 153 | + const row = dataStack.pop(); |
| 154 | + |
| 155 | + // if we popped a repeat, we need to write the subrow we just created. |
| 156 | + if (field?.type === 'repeat') |
| 157 | + field.stream.write(row); |
| 158 | + |
| 159 | + // if we popped everything, we've hit the close tag. write out a few special |
| 160 | + // values, and send the row off to be written by our caller, as it is a different |
| 161 | +        // stream type/mechanism and we don't have a reference to it. |
| 162 | + if (schemaStack.hasExited()) { |
| 163 | + if (!(parser instanceof WritableStream)) { |
| 164 | + parser.reset(); |
| 165 | + } |
| 166 | + resolve(row); |
| 167 | + } |
146 | 168 | } |
147 | | - }, |
148 | | - onclosetag: () => { |
149 | | - // shed a context layer by popping all our state machine stacks. |
150 | | - const field = schemaStack.pop(); |
151 | | - const row = dataStack.pop(); |
152 | | - |
153 | | - // if we popped a repeat, we need to write the subrow we just created. |
154 | | - if (field?.type === 'repeat') |
155 | | - field.stream.write(row); |
156 | | - |
157 | | - // if we popped everything, we've hit the close tag. write out a few special |
158 | | - // values, and send the row off to be written by our caller, as it is a different |
159 | | - // stream type/mechanism and we don't have a reference to it.. |
160 | | - if (schemaStack.hasExited()) { |
161 | | - parser.reset(); |
162 | | - resolve(row); |
163 | | - } |
164 | | - } |
165 | | - }, { xmlMode: true, decodeEntities: true }); |
| 169 | + }, { xmlMode: true, decodeEntities: true }); |
| 170 | + |
| 171 | + return parser; |
| 172 | + }; |
166 | 173 |
|
167 | 174 | if (xml instanceof PartialPipe) { |
168 | | - xml.with(parser).pipeline(rejectIfError(reject)); |
| 175 | + xml.with(createParser(WritableStream)).pipeline(rejectIfError(reject)); |
169 | 176 | } else { |
| 177 | + const parser = createParser(hparser.Parser); |
170 | 178 | parser.write(xml); |
171 | 179 | parser.end(); |
172 | 180 | } |
|
0 commit comments