Skip to content

Commit 120038d

Browse files
authored
Merge pull request #45 from TimG1964/Preserve-whitespace
Respect xml:space="preserve" (#43)
2 parents f259aff + 35f6ed8 commit 120038d

File tree

2 files changed

+224
-76
lines changed

2 files changed

+224
-76
lines changed

src/raw.jl

Lines changed: 116 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
x === RawDocument ? Document :
2929
nothing
3030

31+
#struct XMLSpaceContext
32+
# preserve_space::Vector{Bool} # Stack to track xml:space state
33+
#end
34+
#XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving
35+
3136
#-----------------------------------------------------------------------------# Raw
3237
"""
3338
Raw(filename::String)
@@ -64,8 +69,10 @@ struct Raw
6469
pos::Int
6570
len::Int
6671
data::Vector{UInt8}
72+
ctx::Vector{Bool} # Context for xml:space (Vector so mutable)
6773
end
68-
Raw(data::Vector{UInt8}) = Raw(RawDocument, 0, 0, 0, data)
74+
Raw(data::Vector{UInt8}, ctx=[false]) = Raw(RawDocument, 0, 0, 0, data, ctx)
75+
6976

7077
Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
7178
Raw(Mmap.mmap(filename)) :
@@ -117,7 +124,7 @@ end
117124
# starting at position i, return attributes up until the next '>' or '?' (DTD)
118125
function get_attributes(data, i, j)
119126
i = name_start(data, i)
120-
i > j && return nothing
127+
(isnothing(j) || isnothing(i) || i > j) && return nothing
121128
out = OrderedDict{String, String}()
122129
while !isnothing(i) && i < j
123130
key, i = get_name(data, i)
@@ -161,7 +168,18 @@ function attributes(o::Raw)
161168
i = o.pos
162169
i = name_start(o.data, i)
163170
i = name_stop(o.data, i)
164-
get_attributes(o.data, i + 1, o.pos + o.len)
171+
out=get_attributes(o.data, i + 1, o.pos + o.len)
172+
if o.type === RawElementOpen && !isnothing(out) && haskey(out, "xml:space")
173+
# If an xml:space attribute is present, switch the whitespace context on ("preserve") or off ("default")
174+
if out["xml:space"] == "preserve"
175+
o.ctx[1]= true
176+
elseif out["xml:space"] == "default"
177+
o.ctx[1] = false
178+
else
179+
error("Invalid value for xml:space attribute: $(out["xml:space"]). Must be 'preserve' or 'default'.")
180+
end
181+
end
182+
out
165183
elseif o.type === RawDeclaration
166184
get_attributes(o.data, o.pos + 6, o.pos + o.len)
167185
else
@@ -198,7 +216,11 @@ function children(o::Raw)
198216
depth = o.depth
199217
out = Raw[]
200218
for item in xml_nodes(o)
201-
item.depth == depth + 1 && push!(out, item)
219+
if item.depth == depth + 1
220+
item.ctx[1] = o.ctx[1] # inherit the context
221+
o.type==RawElementOpen && attributes(item)
222+
push!(out, item)
223+
end
202224
item.depth == depth && break
203225
o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
204226
end
@@ -247,55 +269,64 @@ function next(o::Raw)
247269
depth = o.depth
248270
data = o.data
249271
type = o.type
250-
i = findnext(!isspace, data, i) # skip insignificant whitespace
251-
isnothing(i) && return nothing
272+
ctx = o.ctx
273+
k = findnext(!isspace, data, i)
274+
if (isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0)
275+
return nothing
276+
end
277+
i = (ctx[1]) ? i : k
278+
j = i + 1
279+
c = Char(o.data[k])
280+
d = Char(o.data[k+1])
252281
if type === RawElementOpen || type === RawDocument
253282
depth += 1
254283
end
255-
c = Char(o.data[i])
256-
j = i + 1
257-
if c !== '<'
284+
if c !== '<' || type === RawElementOpen && d === '/' && (ctx[1])
258285
type = RawText
259286
j = findnext(==(UInt8('<')), data, i) - 1
260-
j = findprev(!isspace, data, j) # "rstrip"
261-
elseif c === '<'
262-
c2 = Char(o.data[i + 1])
263-
if c2 === '!'
264-
c3 = Char(o.data[i + 2])
265-
if c3 === '-'
266-
type = RawComment
267-
j = findnext(Vector{UInt8}("-->"), data, i)[end]
268-
elseif c3 === '['
269-
type = RawCData
270-
j = findnext(Vector{UInt8}("]]>"), data, i)[end]
271-
elseif c3 === 'D' || c3 == 'd'
272-
type = RawDTD
273-
j = findnext(==(UInt8('>')), data, i)
274-
while sum(==(UInt8('>')), data[i:j]) != sum(==(UInt8('<')), data[i:j])
275-
j = findnext(==(UInt8('>')), data, j + 1)
287+
j = (ctx[1]) ? j : findprev(!isspace, data, j) # preserving whitespace if needed
288+
else
289+
i=k
290+
j=k+1
291+
if c === '<'
292+
c2 = Char(o.data[i + 1])
293+
if c2 === '!'
294+
c3 = Char(o.data[i + 2])
295+
if c3 === '-'
296+
type = RawComment
297+
j = findnext(Vector{UInt8}("-->"), data, i)[end]
298+
elseif c3 === '['
299+
type = RawCData
300+
j = findnext(Vector{UInt8}("]]>"), data, i)[end]
301+
elseif c3 === 'D' || c3 == 'd'
302+
type = RawDTD
303+
j = findnext(==(UInt8('>')), data, i)
304+
while sum(==(UInt8('>')), data[k:j]) != sum(==(UInt8('<')), data[i:j])
305+
j = findnext(==(UInt8('>')), data, j + 1)
306+
end
276307
end
277-
end
278-
elseif c2 === '?'
279-
if get_name(data, i + 2)[1] == "xml"
280-
type = RawDeclaration
281-
else
282-
type = RawProcessingInstruction
283-
end
284-
j = findnext(Vector{UInt8}("?>"), data, i)[end]
285-
elseif c2 === '/'
286-
type = RawElementClose
287-
depth -= 1
288-
j = findnext(==(UInt8('>')), data, i)
289-
else
290-
j = findnext(==(UInt8('>')), data, i)
291-
if data[j-1] === UInt8('/')
292-
type = RawElementSelfClosed
308+
elseif c2 === '?'
309+
if get_name(data, i + 2)[1] == "xml"
310+
type = RawDeclaration
311+
else
312+
type = RawProcessingInstruction
313+
end
314+
j = findnext(Vector{UInt8}("?>"), data, i)[end]
315+
elseif c2 === '/'
316+
type = RawElementClose
317+
depth -= 1
318+
j = findnext(==(UInt8('>')), data, i)
293319
else
294-
type = RawElementOpen
320+
j = findnext(==(UInt8('>')), data, i)
321+
if data[j-1] === UInt8('/')
322+
type = RawElementSelfClosed
323+
else
324+
type = RawElementOpen
325+
end
295326
end
296327
end
297328
end
298-
return Raw(type, depth, i, j - i, data)
329+
return Raw(type, depth, i, j - i, data, ctx)
299330
end
300331

301332
#-----------------------------------------------------------------------------# prev Raw
@@ -308,52 +339,61 @@ function prev(o::Raw)
308339
depth = o.depth
309340
data = o.data
310341
type = o.type
342+
ctx = o.ctx
311343
type === RawDocument && return nothing
312344
j = o.pos - 1
313-
j = findprev(!isspace, data, j) # skip insignificant whitespace
314-
isnothing(j) && return Raw(data) # RawDocument
345+
k = findprev(!isspace, data, j)
346+
if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0
347+
return Raw(data, ctx) # RawDocument
348+
end
349+
j = (ctx[1]) ? j : k
315350
c = Char(o.data[j])
351+
d = Char(data[findprev(==(UInt8('<')), data, j)+1])
316352
i = j - 1
317353
next_type = type
318-
if c !== '>' # text
354+
if c !== '>' || type === RawElementClose && d !== '/' && (ctx[1]) # text or empty whitespace
319355
type = RawText
320-
i = findprev(==(UInt8('>')), data, j) + 1
321-
i = findnext(!isspace, data, i) # "lstrip"
322-
elseif c === '>'
323-
c2 = Char(o.data[j - 1])
324-
if c2 === '-'
325-
type = RawComment
326-
i = findprev(Vector{UInt8}("<--"), data, j)[1]
327-
elseif c2 === ']'
328-
type = RawCData
329-
i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
330-
elseif c2 === '?'
331-
i = findprev(Vector{UInt8}("<?"), data, j)[1]
332-
if get_name(data, i + 2)[1] == "xml"
333-
type = RawDeclaration
356+
i=findprev(==(UInt8('>')), data, j) + 1
357+
i = (ctx[1]) ? i : findprev(!isspace, data, i) # If preserving whitespace, retain leading and trailing whitespace
358+
else
359+
j=k
360+
i=k-1
361+
if c === '>'
362+
c2 = Char(o.data[j - 1])
363+
if c2 === '-'
364+
type = RawComment
365+
i = findprev(Vector{UInt8}("<--"), data, j)[1]
366+
elseif c2 === ']'
367+
type = RawCData
368+
i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
369+
elseif c2 === '?'
370+
i = findprev(Vector{UInt8}("<?"), data, j)[1]
371+
if get_name(data, i + 2)[1] == "xml"
372+
type = RawDeclaration
373+
else
374+
type = RawProcessingInstruction
375+
end
334376
else
335-
type = RawProcessingInstruction
336-
end
377+
i = findprev(==(UInt8('<')), data, j)
378+
char = Char(data[i+1])
379+
if char === '/'
380+
type = RawElementClose
381+
elseif char === '!'
382+
type = DTD
383+
elseif isletter(char) || char === '_'
384+
type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
385+
else
386+
error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
387+
end
388+
end
337389
else
338-
i = findprev(==(UInt8('<')), data, j)
339-
char = Char(data[i+1])
340-
if char === '/'
341-
type = RawElementClose
342-
elseif char === '!'
343-
type = DTD
344-
elseif isletter(char) || char === '_'
345-
type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
346-
else
347-
error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
348-
end
390+
error("Unreachable reached in XML.prev")
349391
end
350-
else
351-
error("Unreachable reached in XML.prev")
352392
end
353393
if type !== RawElementOpen && next_type === RawElementClose
354394
depth += 1
355395
elseif type == RawElementOpen && next_type !== RawElementClose
356396
depth -= 1
357397
end
358-
return Raw(type, depth, i, j - i, data)
398+
return Raw(type, depth, i, j - i, data, ctx)
359399
end

test/runtests.jl

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,114 @@ end
174174
end
175175
end
176176

177+
#-----------------------------------------------------------------------------# Preserve whitespace
178+
@testset "xml:space" begin
179+
@testset "Basic xml:space functionality" begin
180+
181+
# Test 1: xml:space="preserve" should preserve entirely empty whitespace
182+
xml1 = """<root><text xml:space="preserve"> </text></root>"""
183+
doc1 = parse(XML.Node, xml1)
184+
text_content = XML.value(doc1[1][1][1])
185+
@test text_content == " "
186+
187+
# Test 2: xml:space="preserve" should preserve leading and trailing whitespace
188+
xml2 = """<root><text xml:space="preserve"> leading and trailing spaces </text></root>"""
189+
doc2 = parse(XML.Node, xml2)
190+
text_content = XML.value(doc2[1][1][1])
191+
@test text_content == " leading and trailing spaces "
192+
193+
# Test 3: Without xml:space, whitespace-only content should produce a self-closing node
194+
xml3 = """<root><text> </text></root>"""
195+
doc3 = XML.parse(XML.Node, xml3)
196+
text_content = XML.write(doc3[1][1])
197+
@test text_content == "<text/>"
198+
199+
# Test 4: Without xml:space, whitespace should be normalized
200+
xml4 = """<root><text> gets normalized </text></root>"""
201+
doc4 = XML.parse(XML.Node, xml4)
202+
text_content = XML.value(doc4[1][1][1])
203+
@test text_content == "gets normalized"
204+
205+
# Test 5: An explicit xml:space="default" should strip leading and trailing whitespace
206+
xml5 = """<root><text xml:space="default"> gets normalized </text></root>"""
207+
doc5 = XML.parse(XML.Node, xml5)
208+
text_content = XML.value(doc5[1][1][1])
209+
@test text_content == "gets normalized"
210+
end
211+
212+
@testset "xml:space inheritance" begin
213+
# Test 6: Children inherit parent's xml:space="preserve"
214+
xml6 = """<root xml:space="preserve">
215+
<parent> parent text
216+
<child> child text </child>
217+
</parent>
218+
</root>"""
219+
doc6 = XML.parse(XML.Node, xml6)
220+
# Both parent and child should preserve whitespace
221+
@test contains(XML.value(doc6[1][1][1]), "parent text \n")
222+
@test XML.value(doc6[1][1][2][1]) == " child text "
223+
224+
# Test 7: xml:space="default" overrides parent's "preserve"
225+
xml7 = """<root xml:space="preserve">
226+
<child xml:space="default"> normalized despite parent </child>
227+
</root>"""
228+
doc7 = XML.parse(XML.Node, xml7)
229+
@test XML.value(doc7[1][1][1]) == "normalized despite parent"
230+
end
231+
232+
@testset "Nesting scenarios" begin
233+
# Test 8: Multiple levels of xml:space changes
234+
xml8 = """<root xml:space="preserve">
235+
<level1> preserved
236+
<level2 xml:space="default"> normalized
237+
<level3 xml:space="preserve"> preserved again </level3>
238+
</level2>
239+
</level1>
240+
</root>"""
241+
doc8 = XML.parse(XML.Node, xml8)
242+
243+
# level1 should preserve (inherits from root)
244+
level1_text = XML.value(doc8[1][1][1])
245+
@test level1_text == " preserved \n "
246+
247+
# level2 should normalize (explicit xml:space="default")
248+
level2_text = XML.value(doc8[1][1][2][1])
249+
@test level2_text == "normalized"
250+
251+
# level3 should preserve (explicit xml:space="preserve")
252+
level3_text = XML.value(doc8[1][1][2][2][1])
253+
@test level3_text == " preserved again "
254+
255+
# Test 9: repeated multiple levels of xml:space changes
256+
xml9 = """<root xml:space="preserve">
257+
<level1> preserved
258+
<level2 xml:space="default"> normalized
259+
<level3 xml:space="preserve"> preserved again </level3>
260+
</level2>
261+
</level1>
262+
<level1b> preserved b
263+
<level2b xml:space="default"> normalized b
264+
<level3b xml:space="preserve"> preserved again b </level3b>
265+
</level2b>
266+
</level1b>
267+
</root>"""
268+
doc9 = XML.parse(XML.Node, xml9)
269+
270+
# level1b should preserve (inherits from root)
271+
level1b_text = XML.value(doc9[1][2][1])
272+
@test level1b_text == " preserved b \n "
273+
274+
# level2b should normalize (explicit xml:space="default")
275+
level2b_text = XML.value(doc9[1][2][2][1])
276+
@test level2b_text == "normalized b"
277+
278+
# level3b should preserve (explicit xml:space="preserve")
279+
level3b_text = XML.value(doc9[1][2][2][2][1])
280+
@test level3b_text == " preserved again b "
281+
282+
end
283+
end
284+
177285
#-----------------------------------------------------------------------------# roundtrip
178286
@testset "read/write/read roundtrip" begin
179287
for path in all_files

0 commit comments

Comments
 (0)