2828 x === RawDocument ? Document :
2929 nothing
3030
31+ # struct XMLSpaceContext
32+ # preserve_space::Vector{Bool} # Stack to track xml:space state
33+ # end
34+ # XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving
35+
3136# -----------------------------------------------------------------------------# Raw
3237"""
3338 Raw(filename::String)
@@ -64,8 +69,10 @@ struct Raw
6469 pos:: Int
6570 len:: Int
6671 data:: Vector{UInt8}
72+ ctx:: Vector{Bool} # Context for xml:space (Vector so mutable)
6773end
68- Raw (data:: Vector{UInt8} ) = Raw (RawDocument, 0 , 0 , 0 , data)
74+ Raw (data:: Vector{UInt8} , ctx= [false ]) = Raw (RawDocument, 0 , 0 , 0 , data, ctx)
75+
6976
7077Base. read (filename:: String , :: Type{Raw} ) = isfile (filename) ?
7178 Raw (Mmap. mmap (filename)) :
117124# starting at position i, return attributes up until the next '>' or '?' (DTD)
118125function get_attributes (data, i, j)
119126 i = name_start (data, i)
120- i > j && return nothing
127+ ( isnothing (j) || isnothing (i) || i > j) && return nothing
121128 out = OrderedDict {String, String} ()
122129 while ! isnothing (i) && i < j
123130 key, i = get_name (data, i)
@@ -161,7 +168,18 @@ function attributes(o::Raw)
161168 i = o. pos
162169 i = name_start (o. data, i)
163170 i = name_stop (o. data, i)
164- get_attributes (o. data, i + 1 , o. pos + o. len)
171+ out= get_attributes (o. data, i + 1 , o. pos + o. len)
172+ if o. type === RawElementOpen && ! isnothing (out) && haskey (out, " xml:space" )
173+ # If xml:space attribute is present, we need to preserve whitespace
174+ if out[" xml:space" ] == " preserve"
175+ o. ctx[1 ]= true
176+ elseif out[" xml:space" ] == " default"
177+ o. ctx[1 ] = false
178+ else
179+ error (" Invalid value for xml:space attribute: $(out[" xml:space" ]) . Must be 'preserve' or 'default'." )
180+ end
181+ end
182+ out
165183 elseif o. type === RawDeclaration
166184 get_attributes (o. data, o. pos + 6 , o. pos + o. len)
167185 else
@@ -198,7 +216,11 @@ function children(o::Raw)
198216 depth = o. depth
199217 out = Raw[]
200218 for item in xml_nodes (o)
201- item. depth == depth + 1 && push! (out, item)
219+ if item. depth == depth + 1
220+ item. ctx[1 ] = o. ctx[1 ] # inherit the context
221+ o. type== RawElementOpen && attributes (item)
222+ push! (out, item)
223+ end
202224 item. depth == depth && break
203225 o. type === RawDocument && item. depth == 2 && break # break if we've seen the doc root
204226 end
@@ -247,55 +269,64 @@ function next(o::Raw)
247269 depth = o. depth
248270 data = o. data
249271 type = o. type
250- i = findnext (! isspace, data, i) # skip insignificant whitespace
251- isnothing (i) && return nothing
272+ ctx = o. ctx
273+ k = findnext (! isspace, data, i)
274+ if (isnothing (k) || length (String (o. data[o. pos + o. len + 1 : end ]))== 0 )
275+ return nothing
276+ end
277+ i = (ctx[1 ]) ? i : k
278+ j = i + 1
279+ c = Char (o. data[k])
280+ d = Char (o. data[k+ 1 ])
252281 if type === RawElementOpen || type === RawDocument
253282 depth += 1
254283 end
255- c = Char (o. data[i])
256- j = i + 1
257- if c != = ' <'
284+ if c != = ' <' || type === RawElementOpen && d === ' /' && (ctx[1 ])
258285 type = RawText
259286 j = findnext (== (UInt8 (' <' )), data, i) - 1
260- j = findprev (! isspace, data, j) # "rstrip"
261- elseif c === ' <'
262- c2 = Char (o. data[i + 1 ])
263- if c2 === ' !'
264- c3 = Char (o. data[i + 2 ])
265- if c3 === ' -'
266- type = RawComment
267- j = findnext (Vector {UInt8} (" -->" ), data, i)[end ]
268- elseif c3 === ' ['
269- type = RawCData
270- j = findnext (Vector {UInt8} (" ]]>" ), data, i)[end ]
271- elseif c3 === ' D' || c3 == ' d'
272- type = RawDTD
273- j = findnext (== (UInt8 (' >' )), data, i)
274- while sum (== (UInt8 (' >' )), data[i: j]) != sum (== (UInt8 (' <' )), data[i: j])
275- j = findnext (== (UInt8 (' >' )), data, j + 1 )
287+ j = (ctx[1 ]) ? j : findprev (! isspace, data, j) # preserving whitespace if needed
288+ else
289+ i= k
290+ j= k+ 1
291+ if c === ' <'
292+ c2 = Char (o. data[i + 1 ])
293+ if c2 === ' !'
294+ c3 = Char (o. data[i + 2 ])
295+ if c3 === ' -'
296+ type = RawComment
297+ j = findnext (Vector {UInt8} (" -->" ), data, i)[end ]
298+ elseif c3 === ' ['
299+ type = RawCData
300+ j = findnext (Vector {UInt8} (" ]]>" ), data, i)[end ]
301+ elseif c3 === ' D' || c3 == ' d'
302+ type = RawDTD
303+ j = findnext (== (UInt8 (' >' )), data, i)
304+ while sum (== (UInt8 (' >' )), data[k: j]) != sum (== (UInt8 (' <' )), data[i: j])
305+ j = findnext (== (UInt8 (' >' )), data, j + 1 )
306+ end
276307 end
277- end
278- elseif c2 === ' ?'
279- if get_name (data, i + 2 )[1 ] == " xml"
280- type = RawDeclaration
281- else
282- type = RawProcessingInstruction
283- end
284- j = findnext (Vector {UInt8} (" ?>" ), data, i)[end ]
285- elseif c2 === ' /'
286- type = RawElementClose
287- depth -= 1
288- j = findnext (== (UInt8 (' >' )), data, i)
289- else
290- j = findnext (== (UInt8 (' >' )), data, i)
291- if data[j- 1 ] === UInt8 (' /' )
292- type = RawElementSelfClosed
308+ elseif c2 === ' ?'
309+ if get_name (data, i + 2 )[1 ] == " xml"
310+ type = RawDeclaration
311+ else
312+ type = RawProcessingInstruction
313+ end
314+ j = findnext (Vector {UInt8} (" ?>" ), data, i)[end ]
315+ elseif c2 === ' /'
316+ type = RawElementClose
317+ depth -= 1
318+ j = findnext (== (UInt8 (' >' )), data, i)
293319 else
294- type = RawElementOpen
320+ j = findnext (== (UInt8 (' >' )), data, i)
321+ if data[j- 1 ] === UInt8 (' /' )
322+ type = RawElementSelfClosed
323+ else
324+ type = RawElementOpen
325+ end
295326 end
296327 end
297328 end
298- return Raw (type, depth, i, j - i, data)
329+ return Raw (type, depth, i, j - i, data, ctx )
299330end
300331
301332# -----------------------------------------------------------------------------# prev Raw
@@ -308,52 +339,61 @@ function prev(o::Raw)
308339 depth = o. depth
309340 data = o. data
310341 type = o. type
342+ ctx = o. ctx
311343 type === RawDocument && return nothing
312344 j = o. pos - 1
313- j = findprev (! isspace, data, j) # skip insignificant whitespace
314- isnothing (j) && return Raw (data) # RawDocument
345+ k = findprev (! isspace, data, j)
346+ if isnothing (k) || length (String (o. data[o. pos + o. len + 1 : end ]))== 0
347+ return Raw (data, ctx) # RawDocument
348+ end
349+ j = (ctx[1 ]) ? j : k
315350 c = Char (o. data[j])
351+ d = Char (data[findprev (== (UInt8 (' <' )), data, j)+ 1 ])
316352 i = j - 1
317353 next_type = type
318- if c != = ' >' # text
354+ if c != = ' >' || type === RawElementClose && d != = ' / ' && (ctx[ 1 ]) # text or empty whitespace
319355 type = RawText
320- i = findprev (== (UInt8 (' >' )), data, j) + 1
321- i = findnext (! isspace, data, i) # "lstrip"
322- elseif c === ' >'
323- c2 = Char (o. data[j - 1 ])
324- if c2 === ' -'
325- type = RawComment
326- i = findprev (Vector {UInt8} (" <--" ), data, j)[1 ]
327- elseif c2 === ' ]'
328- type = RawCData
329- i = findprev (Vector {UInt8} (" <![CData[" ), data, j)[1 ]
330- elseif c2 === ' ?'
331- i = findprev (Vector {UInt8} (" <?" ), data, j)[1 ]
332- if get_name (data, i + 2 )[1 ] == " xml"
333- type = RawDeclaration
356+ i= findprev (== (UInt8 (' >' )), data, j) + 1
357+ i = (ctx[1 ]) ? i : findprev (! isspace, data, i) # If preserving whitespace, retain leading and trailing whitespace
358+ else
359+ j= k
360+ i= k- 1
361+ if c === ' >'
362+ c2 = Char (o. data[j - 1 ])
363+ if c2 === ' -'
364+ type = RawComment
365+ i = findprev (Vector {UInt8} (" <--" ), data, j)[1 ]
366+ elseif c2 === ' ]'
367+ type = RawCData
368+ i = findprev (Vector {UInt8} (" <![CData[" ), data, j)[1 ]
369+ elseif c2 === ' ?'
370+ i = findprev (Vector {UInt8} (" <?" ), data, j)[1 ]
371+ if get_name (data, i + 2 )[1 ] == " xml"
372+ type = RawDeclaration
373+ else
374+ type = RawProcessingInstruction
375+ end
334376 else
335- type = RawProcessingInstruction
336- end
377+ i = findprev (== (UInt8 (' <' )), data, j)
378+ char = Char (data[i+ 1 ])
379+ if char === ' /'
380+ type = RawElementClose
381+ elseif char === ' !'
382+ type = DTD
383+ elseif isletter (char) || char === ' _'
384+ type = Char (o. data[j - 2 ]) === ' /' ? RawElementSelfClosed : RawElementOpen
385+ else
386+ error (" Should be unreachable. Unexpected data: <$char ... $c3$c2$c1 >." )
387+ end
388+ end
337389 else
338- i = findprev (== (UInt8 (' <' )), data, j)
339- char = Char (data[i+ 1 ])
340- if char === ' /'
341- type = RawElementClose
342- elseif char === ' !'
343- type = DTD
344- elseif isletter (char) || char === ' _'
345- type = Char (o. data[j - 2 ]) === ' /' ? RawElementSelfClosed : RawElementOpen
346- else
347- error (" Should be unreachable. Unexpected data: <$char ... $c3$c2$c1 >." )
348- end
390+ error (" Unreachable reached in XML.prev" )
349391 end
350- else
351- error (" Unreachable reached in XML.prev" )
352392 end
353393 if type != = RawElementOpen && next_type === RawElementClose
354394 depth += 1
355395 elseif type == RawElementOpen && next_type != = RawElementClose
356396 depth -= 1
357397 end
358- return Raw (type, depth, i, j - i, data)
398+ return Raw (type, depth, i, j - i, data, ctx )
359399end
0 commit comments