28
28
x === RawDocument ? Document :
29
29
nothing
30
30
31
+ # struct XMLSpaceContext
32
+ # preserve_space::Vector{Bool} # Stack to track xml:space state
33
+ # end
34
+ # XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving
35
+
31
36
# -----------------------------------------------------------------------------# Raw
32
37
"""
33
38
Raw(filename::String)
@@ -64,8 +69,10 @@ struct Raw
64
69
pos:: Int
65
70
len:: Int
66
71
data:: Vector{UInt8}
72
+ ctx:: Vector{Bool} # Context for xml:space (Vector so mutable)
67
73
end
68
- Raw (data:: Vector{UInt8} ) = Raw (RawDocument, 0 , 0 , 0 , data)
74
# Document-level constructor: wraps `data` as a `RawDocument` node whose depth, pos and len are all 0.
# `ctx` is stored in the `ctx` field (the xml:space flag holder); when omitted it defaults to `[false]`.
+ Raw (data:: Vector{UInt8} , ctx= [false ]) = Raw (RawDocument, 0 , 0 , 0 , data, ctx)
75
+
69
76
70
77
Base. read (filename:: String , :: Type{Raw} ) = isfile (filename) ?
71
78
Raw (Mmap. mmap (filename)) :
117
124
# Starting at position i, return the attributes found before position j (the next '>' or '?' (DTD)), or nothing if no name is found or the next name starts past j
118
125
function get_attributes (data, i, j)
119
126
i = name_start (data, i)
120
- i > j && return nothing
127
+ ( isnothing (j) || isnothing (i) || i > j) && return nothing
121
128
out = OrderedDict {String, String} ()
122
129
while ! isnothing (i) && i < j
123
130
key, i = get_name (data, i)
@@ -161,7 +168,18 @@ function attributes(o::Raw)
161
168
i = o. pos
162
169
i = name_start (o. data, i)
163
170
i = name_stop (o. data, i)
164
- get_attributes (o. data, i + 1 , o. pos + o. len)
171
+ out= get_attributes (o. data, i + 1 , o. pos + o. len)
172
+ if o. type === RawElementOpen && ! isnothing (out) && haskey (out, " xml:space" )
173
+ # xml:space is present: "preserve" turns whitespace preservation on, "default" turns it off, and any other value is an error
174
+ if out[" xml:space" ] == " preserve"
175
+ o. ctx[1 ]= true
176
+ elseif out[" xml:space" ] == " default"
177
+ o. ctx[1 ] = false
178
+ else
179
+ error (" Invalid value for xml:space attribute: $(out[" xml:space" ]) . Must be 'preserve' or 'default'." )
180
+ end
181
+ end
182
+ out
165
183
elseif o. type === RawDeclaration
166
184
get_attributes (o. data, o. pos + 6 , o. pos + o. len)
167
185
else
@@ -198,7 +216,11 @@ function children(o::Raw)
198
216
depth = o. depth
199
217
out = Raw[]
200
218
for item in xml_nodes (o)
201
- item. depth == depth + 1 && push! (out, item)
219
+ if item. depth == depth + 1
220
+ item. ctx[1 ] = o. ctx[1 ] # inherit the context
221
+ o. type== RawElementOpen && attributes (item)
222
+ push! (out, item)
223
+ end
202
224
item. depth == depth && break
203
225
o. type === RawDocument && item. depth == 2 && break # break if we've seen the doc root
204
226
end
@@ -247,55 +269,64 @@ function next(o::Raw)
247
269
depth = o. depth
248
270
data = o. data
249
271
type = o. type
250
- i = findnext (! isspace, data, i) # skip insignificant whitespace
251
- isnothing (i) && return nothing
272
+ ctx = o. ctx
273
+ k = findnext (! isspace, data, i)
274
+ if (isnothing (k) || length (String (o. data[o. pos + o. len + 1 : end ]))== 0 )
275
+ return nothing
276
+ end
277
+ i = (ctx[1 ]) ? i : k
278
+ j = i + 1
279
+ c = Char (o. data[k])
280
+ d = Char (o. data[k+ 1 ])
252
281
if type === RawElementOpen || type === RawDocument
253
282
depth += 1
254
283
end
255
- c = Char (o. data[i])
256
- j = i + 1
257
- if c != = ' <'
284
+ if c != = ' <' || type === RawElementOpen && d === ' /' && (ctx[1 ])
258
285
type = RawText
259
286
j = findnext (== (UInt8 (' <' )), data, i) - 1
260
- j = findprev (! isspace, data, j) # "rstrip"
261
- elseif c === ' <'
262
- c2 = Char (o. data[i + 1 ])
263
- if c2 === ' !'
264
- c3 = Char (o. data[i + 2 ])
265
- if c3 === ' -'
266
- type = RawComment
267
- j = findnext (Vector {UInt8} (" -->" ), data, i)[end ]
268
- elseif c3 === ' ['
269
- type = RawCData
270
- j = findnext (Vector {UInt8} (" ]]>" ), data, i)[end ]
271
- elseif c3 === ' D' || c3 == ' d'
272
- type = RawDTD
273
- j = findnext (== (UInt8 (' >' )), data, i)
274
- while sum (== (UInt8 (' >' )), data[i: j]) != sum (== (UInt8 (' <' )), data[i: j])
275
- j = findnext (== (UInt8 (' >' )), data, j + 1 )
287
+ j = (ctx[1 ]) ? j : findprev (! isspace, data, j) # preserving whitespace if needed
288
+ else
289
+ i= k
290
+ j= k+ 1
291
+ if c === ' <'
292
+ c2 = Char (o. data[i + 1 ])
293
+ if c2 === ' !'
294
+ c3 = Char (o. data[i + 2 ])
295
+ if c3 === ' -'
296
+ type = RawComment
297
+ j = findnext (Vector {UInt8} (" -->" ), data, i)[end ]
298
+ elseif c3 === ' ['
299
+ type = RawCData
300
+ j = findnext (Vector {UInt8} (" ]]>" ), data, i)[end ]
301
+ elseif c3 === ' D' || c3 == ' d'
302
+ type = RawDTD
303
+ j = findnext (== (UInt8 (' >' )), data, i)
304
+ while sum (== (UInt8 (' >' )), data[k: j]) != sum (== (UInt8 (' <' )), data[i: j])
305
+ j = findnext (== (UInt8 (' >' )), data, j + 1 )
306
+ end
276
307
end
277
- end
278
- elseif c2 === ' ?'
279
- if get_name (data, i + 2 )[1 ] == " xml"
280
- type = RawDeclaration
281
- else
282
- type = RawProcessingInstruction
283
- end
284
- j = findnext (Vector {UInt8} (" ?>" ), data, i)[end ]
285
- elseif c2 === ' /'
286
- type = RawElementClose
287
- depth -= 1
288
- j = findnext (== (UInt8 (' >' )), data, i)
289
- else
290
- j = findnext (== (UInt8 (' >' )), data, i)
291
- if data[j- 1 ] === UInt8 (' /' )
292
- type = RawElementSelfClosed
308
+ elseif c2 === ' ?'
309
+ if get_name (data, i + 2 )[1 ] == " xml"
310
+ type = RawDeclaration
311
+ else
312
+ type = RawProcessingInstruction
313
+ end
314
+ j = findnext (Vector {UInt8} (" ?>" ), data, i)[end ]
315
+ elseif c2 === ' /'
316
+ type = RawElementClose
317
+ depth -= 1
318
+ j = findnext (== (UInt8 (' >' )), data, i)
293
319
else
294
- type = RawElementOpen
320
+ j = findnext (== (UInt8 (' >' )), data, i)
321
+ if data[j- 1 ] === UInt8 (' /' )
322
+ type = RawElementSelfClosed
323
+ else
324
+ type = RawElementOpen
325
+ end
295
326
end
296
327
end
297
328
end
298
- return Raw (type, depth, i, j - i, data)
329
+ return Raw (type, depth, i, j - i, data, ctx )
299
330
end
300
331
301
332
# -----------------------------------------------------------------------------# prev Raw
@@ -308,52 +339,61 @@ function prev(o::Raw)
308
339
depth = o. depth
309
340
data = o. data
310
341
type = o. type
342
+ ctx = o. ctx
311
343
type === RawDocument && return nothing
312
344
j = o. pos - 1
313
- j = findprev (! isspace, data, j) # skip insignificant whitespace
314
- isnothing (j) && return Raw (data) # RawDocument
345
+ k = findprev (! isspace, data, j)
346
+ if isnothing (k) || length (String (o. data[o. pos + o. len + 1 : end ]))== 0
347
+ return Raw (data, ctx) # RawDocument
348
+ end
349
+ j = (ctx[1 ]) ? j : k
315
350
c = Char (o. data[j])
351
+ d = Char (data[findprev (== (UInt8 (' <' )), data, j)+ 1 ])
316
352
i = j - 1
317
353
next_type = type
318
- if c != = ' >' # text
354
+ if c != = ' >' || type === RawElementClose && d != = ' / ' && (ctx[ 1 ]) # text or empty whitespace
319
355
type = RawText
320
- i = findprev (== (UInt8 (' >' )), data, j) + 1
321
- i = findnext (! isspace, data, i) # "lstrip"
322
- elseif c === ' >'
323
- c2 = Char (o. data[j - 1 ])
324
- if c2 === ' -'
325
- type = RawComment
326
- i = findprev (Vector {UInt8} (" <--" ), data, j)[1 ]
327
- elseif c2 === ' ]'
328
- type = RawCData
329
- i = findprev (Vector {UInt8} (" <![CData[" ), data, j)[1 ]
330
- elseif c2 === ' ?'
331
- i = findprev (Vector {UInt8} (" <?" ), data, j)[1 ]
332
- if get_name (data, i + 2 )[1 ] == " xml"
333
- type = RawDeclaration
356
+ i= findprev (== (UInt8 (' >' )), data, j) + 1
357
+ i = (ctx[1 ]) ? i : findprev (! isspace, data, i) # If preserving whitespace, retain leading and trailing whitespace
358
+ else
359
+ j= k
360
+ i= k- 1
361
+ if c === ' >'
362
+ c2 = Char (o. data[j - 1 ])
363
+ if c2 === ' -'
364
+ type = RawComment
365
+ i = findprev (Vector {UInt8} (" <--" ), data, j)[1 ]
366
+ elseif c2 === ' ]'
367
+ type = RawCData
368
+ i = findprev (Vector {UInt8} (" <![CData[" ), data, j)[1 ]
369
+ elseif c2 === ' ?'
370
+ i = findprev (Vector {UInt8} (" <?" ), data, j)[1 ]
371
+ if get_name (data, i + 2 )[1 ] == " xml"
372
+ type = RawDeclaration
373
+ else
374
+ type = RawProcessingInstruction
375
+ end
334
376
else
335
- type = RawProcessingInstruction
336
- end
377
+ i = findprev (== (UInt8 (' <' )), data, j)
378
+ char = Char (data[i+ 1 ])
379
+ if char === ' /'
380
+ type = RawElementClose
381
+ elseif char === ' !'
382
+ type = DTD
383
+ elseif isletter (char) || char === ' _'
384
+ type = Char (o. data[j - 2 ]) === ' /' ? RawElementSelfClosed : RawElementOpen
385
+ else
386
+ error (" Should be unreachable. Unexpected data: <$char ... $c3$c2$c1 >." )
387
+ end
388
+ end
337
389
else
338
- i = findprev (== (UInt8 (' <' )), data, j)
339
- char = Char (data[i+ 1 ])
340
- if char === ' /'
341
- type = RawElementClose
342
- elseif char === ' !'
343
- type = DTD
344
- elseif isletter (char) || char === ' _'
345
- type = Char (o. data[j - 2 ]) === ' /' ? RawElementSelfClosed : RawElementOpen
346
- else
347
- error (" Should be unreachable. Unexpected data: <$char ... $c3$c2$c1 >." )
348
- end
390
+ error (" Unreachable reached in XML.prev" )
349
391
end
350
- else
351
- error (" Unreachable reached in XML.prev" )
352
392
end
353
393
if type != = RawElementOpen && next_type === RawElementClose
354
394
depth += 1
355
395
elseif type == RawElementOpen && next_type != = RawElementClose
356
396
depth -= 1
357
397
end
358
- return Raw (type, depth, i, j - i, data)
398
+ return Raw (type, depth, i, j - i, data, ctx )
359
399
end
0 commit comments