From bcee4b744fd237527ad8f25cec90e7935c379c62 Mon Sep 17 00:00:00 2001 From: Tim Gebbels Date: Fri, 27 Jun 2025 13:30:26 +0100 Subject: [PATCH 1/4] Respect xml:space="preserve" (#43) --- src/raw.jl | 206 ++++++++++++++++++++++++++++++----------------- test/runtests.jl | 110 ++++++++++++++++++++++++- 2 files changed, 239 insertions(+), 77 deletions(-) diff --git a/src/raw.jl b/src/raw.jl index b2b44f5..36922c7 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -28,6 +28,11 @@ x === RawDocument ? Document : nothing +struct XMLSpaceContext + preserve_space::Vector{Bool} # Stack to track xml:space state +end +XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving + #-----------------------------------------------------------------------------# Raw """ Raw(filename::String) @@ -64,8 +69,10 @@ struct Raw pos::Int len::Int data::Vector{UInt8} + ctx::XMLSpaceContext end -Raw(data::Vector{UInt8}) = Raw(RawDocument, 0, 0, 0, data) +Raw(data::Vector{UInt8}, ctx=XMLSpaceContext()) = Raw(RawDocument, 0, 0, 0, data, ctx) + Base.read(filename::String, ::Type{Raw}) = isfile(filename) ? Raw(Mmap.mmap(filename)) : @@ -117,7 +124,7 @@ end # starting at position i, return attributes up until the next '>' or '?' 
(DTD) function get_attributes(data, i, j) i = name_start(data, i) - i > j && return nothing + (isnothing(j) || isnothing(i) || i > j) && return nothing out = OrderedDict{String, String}() while !isnothing(i) && i < j key, i = get_name(data, i) @@ -161,7 +168,26 @@ function attributes(o::Raw) i = o.pos i = name_start(o.data, i) i = name_stop(o.data, i) - get_attributes(o.data, i + 1, o.pos + o.len) + out=get_attributes(o.data, i + 1, o.pos + o.len) + if !isnothing(out) && haskey(out, "xml:space") + # If xml:space attribute is present, we need to preserve whitespace + if out["xml:space"] == "preserve" + push!(o.ctx.preserve_space, true) + elseif out["xml:space"] == "default" + push!(o.ctx.preserve_space, false) + else + error("Invalid value for xml:space attribute: $(out["xml:space"]). Must be 'preserve' or 'default'.") + end + end + out + + elseif o.type === RawText + if length(o.ctx.preserve_space)>0 + push!(o.ctx.preserve_space, o.ctx.preserve_space[end]) + else + push!(o.ctx.preserve_space, false) + end + nothing elseif o.type === RawDeclaration get_attributes(o.data, o.pos + 6, o.pos + o.len) else @@ -198,7 +224,15 @@ function children(o::Raw) depth = o.depth out = Raw[] for item in xml_nodes(o) - item.depth == depth + 1 && push!(out, item) + if item.depth == depth + 1 + if length(item.ctx.preserve_space) > 0 + item.ctx.preserve_space[1] = o.ctx.preserve_space[end] # inherit the context + else + push!(item.ctx.preserve_space, false) + end + o.type==RawElementOpen && attributes(item) + push!(out, item) + end item.depth == depth && break o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root end @@ -247,55 +281,65 @@ function next(o::Raw) depth = o.depth data = o.data type = o.type - i = findnext(!isspace, data, i) # skip insignificant whitespace - isnothing(i) && return nothing + ctx = o.ctx + k = findnext(!isspace, data, i) + if (isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0) + length(ctx.preserve_space)>0 && 
pop!(ctx.preserve_space) # pop the previous context + return nothing + end + i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? k : i + j = i + 1 + c = Char(o.data[k]) + d = Char(o.data[k+1]) if type === RawElementOpen || type === RawDocument depth += 1 end - c = Char(o.data[i]) - j = i + 1 - if c !== '<' + if c !== '<' || type === RawElementOpen && d === '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) type = RawText j = findnext(==(UInt8('<')), data, i) - 1 - j = findprev(!isspace, data, j) # "rstrip" - elseif c === '<' - c2 = Char(o.data[i + 1]) - if c2 === '!' - c3 = Char(o.data[i + 2]) - if c3 === '-' - type = RawComment - j = findnext(Vector{UInt8}("-->"), data, i)[end] - elseif c3 === '[' - type = RawCData - j = findnext(Vector{UInt8}("]]>"), data, i)[end] - elseif c3 === 'D' || c3 == 'd' - type = RawDTD - j = findnext(==(UInt8('>')), data, i) - while sum(==(UInt8('>')), data[i:j]) != sum(==(UInt8('<')), data[i:j]) - j = findnext(==(UInt8('>')), data, j + 1) + j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, j) : j # preserving whitespace if needed + else + i=k + j=k+1 + if c === '<' + c2 = Char(o.data[i + 1]) + if c2 === '!' + c3 = Char(o.data[i + 2]) + if c3 === '-' + type = RawComment + j = findnext(Vector{UInt8}("-->"), data, i)[end] + elseif c3 === '[' + type = RawCData + j = findnext(Vector{UInt8}("]]>"), data, i)[end] + elseif c3 === 'D' || c3 == 'd' + type = RawDTD + j = findnext(==(UInt8('>')), data, i) + while sum(==(UInt8('>')), data[k:j]) != sum(==(UInt8('<')), data[i:j]) + j = findnext(==(UInt8('>')), data, j + 1) + end end - end - elseif c2 === '?' 
- if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - j = findnext(Vector{UInt8}("?>"), data, i)[end] - elseif c2 === '/' - type = RawElementClose - depth -= 1 - j = findnext(==(UInt8('>')), data, i) - else - j = findnext(==(UInt8('>')), data, i) - if data[j-1] === UInt8('/') - type = RawElementSelfClosed + elseif c2 === '?' + if get_name(data, i + 2)[1] == "xml" + type = RawDeclaration + else + type = RawProcessingInstruction + end + j = findnext(Vector{UInt8}("?>"), data, i)[end] + elseif c2 === '/' + type = RawElementClose + depth -= 1 + j = findnext(==(UInt8('>')), data, i) else - type = RawElementOpen + j = findnext(==(UInt8('>')), data, i) + if data[j-1] === UInt8('/') + type = RawElementSelfClosed + else + type = RawElementOpen + end end end end - return Raw(type, depth, i, j - i, data) + return Raw(type, depth, i, j - i, data, ctx) end #-----------------------------------------------------------------------------# prev Raw @@ -308,52 +352,62 @@ function prev(o::Raw) depth = o.depth data = o.data type = o.type + ctx = o.ctx type === RawDocument && return nothing j = o.pos - 1 - j = findprev(!isspace, data, j) # skip insignificant whitespace - isnothing(j) && return Raw(data) # RawDocument + k = findprev(!isspace, data, j) + if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0 + length(ctx.preserve_space)>0 && pop!(ctx.preserve_space) # pop the previous context + return Raw(data) # RawDocument + end + j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? 
k : j c = Char(o.data[j]) + d = Char(data[findprev(==(UInt8('<')), data, j)+1]) i = j - 1 next_type = type - if c !== '>' # text + if c !== '>' || type === RawElementClose && d === '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) # text or empty whitespace type = RawText - i = findprev(==(UInt8('>')), data, j) + 1 - i = findnext(!isspace, data, i) # "lstrip" - elseif c === '>' - c2 = Char(o.data[j - 1]) - if c2 === '-' - type = RawComment - i = findprev(Vector{UInt8}("<--"), data, j)[1] - elseif c2 === ']' - type = RawCData - i = findprev(Vector{UInt8}("')), data, j) + 1 + i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, i) : i # If preserving whitespace, retain leading and trailing whitespace + else + j=k + i=k-1 + if c === '>' + c2 = Char(o.data[j - 1]) + if c2 === '-' + type = RawComment + i = findprev(Vector{UInt8}("<--"), data, j)[1] + elseif c2 === ']' + type = RawCData + i = findprev(Vector{UInt8}(".") + end + end else - i = findprev(==(UInt8('<')), data, j) - char = Char(data[i+1]) - if char === '/' - type = RawElementClose - elseif char === '!' - type = DTD - elseif isletter(char) || char === '_' - type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen - else - error("Should be unreachable. Unexpected data: <$char ... 
$c3$c2$c1>.") - end + error("Unreachable reached in XML.prev") end - else - error("Unreachable reached in XML.prev") end if type !== RawElementOpen && next_type === RawElementClose depth += 1 elseif type == RawElementOpen && next_type !== RawElementClose depth -= 1 end - return Raw(type, depth, i, j - i, data) + return Raw(type, depth, i, j - i, data, ctx) end diff --git a/test/runtests.jl b/test/runtests.jl index d41924b..a04cc4a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -116,7 +116,7 @@ end @test String(doc[end]) == "" @testset "next and prev" begin - @test XML.prev(doc[1]) === data + @test XML.prev(doc[1]) == data @test prev(data) === nothing @test XML.next(doc[end]) === nothing @@ -174,6 +174,114 @@ end end end +#-----------------------------------------------------------------------------# Preserve whitespace +@testset "xml:space" begin + @testset "Basic xml:space functionality" begin + + # Test 1: xml:space="preserve" should preserve entirely empty whitespace + xml1 = """ """ + doc1 = parse(XML.Node, xml1) + text_content = XML.value(doc1[1][1][1]) + @test text_content == " " + + # Test 2: xml:space="preserve" should preserve leading and trailing whitespace + xml2 = """ leading and trailing spaces """ + doc2 = parse(XML.Node, xml2) + text_content = XML.value(doc2[1][1][1]) + @test text_content == " leading and trailing spaces " + + # Test 3: Without xml:space, entirely empty whitespace should create a self closing node + xml3 = """ """ + doc3 = XML.parse(XML.Node, xml3) + text_content = XML.write(doc3[1][1]) + @test text_content == "" + + # Test 4: Without xml:space, whitespace should be normalized + xml4 = """ gets normalized """ + doc4 = XML.parse(XML.Node, xml4) + text_content = XML.value(doc4[1][1][1]) + @test text_content == "gets normalized" + + # Test 5: xml:space="default" should normalize even with preserve_xml_space=true + xml5 = """ gets normalized """ + doc5 = XML.parse(XML.Node, xml5) + text_content = XML.value(doc5[1][1][1]) + @test 
text_content == "gets normalized" + end + + @testset "xml:space inheritance" begin + # Test 6: Children inherit parent's xml:space="preserve" + xml6 = """ + parent text + child text + + """ + doc6 = XML.parse(XML.Node, xml6) + # Both parent and child should preserve whitespace + @test contains(XML.value(doc6[1][1][1]), "parent text \n") + @test XML.value(doc6[1][1][2][1]) == " child text " + + # Test 7: xml:space="default" overrides parent's "preserve" + xml7 = """ + normalized despite parent + """ + doc7 = XML.parse(XML.Node, xml7) + @test XML.value(doc7[1][1][1]) == "normalized despite parent" + end + + @testset "Nesting scenarios" begin + # Test 8: Multiple levels of xml:space changes + xml8 = """ + preserved + normalized + preserved again + + + """ + doc8 = XML.parse(XML.Node, xml8) + + # level1 should preserve (inherits from root) + level1_text = XML.value(doc8[1][1][1]) + @test level1_text == " preserved \n " + + # level2 should normalize (explicit xml:space="default") + level2_text = XML.value(doc8[1][1][2][1]) + @test level2_text == "normalized" + + # level3 should preserve (explicit xml:space="preserve") + level3_text = XML.value(doc8[1][1][2][2][1]) + @test level3_text == " preserved again " + + # Test 9: repeated multiple levels of xml:space changes + xml9 = """ + preserved + normalized + preserved again + + + preserved b + normalized b + preserved again b + + + """ + doc9 = XML.parse(XML.Node, xml9) + + # level1b should preserve (inherits from root) + level1b_text = XML.value(doc9[1][2][1]) + @test level1b_text == " preserved b \n " + + # level2 should normalize (explicit xml:space="default") + level2b_text = XML.value(doc9[1][2][2][1]) + @test level2b_text == "normalized b" + + # level3 should preserve (explicit xml:space="preserve") + level3b_text = XML.value(doc9[1][2][2][2][1]) + @test level3b_text == " preserved again b " + + end +end + #-----------------------------------------------------------------------------# roundtrip @testset 
"read/write/read roundtrip" begin for path in all_files From 408c064139a4a27f444ac5a9450be2aae379cffc Mon Sep 17 00:00:00 2001 From: Tim Gebbels Date: Fri, 27 Jun 2025 18:17:16 +0100 Subject: [PATCH 2/4] Minor typo in apparently immaterial test --- src/raw.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/raw.jl b/src/raw.jl index 36922c7..e849993 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -365,7 +365,7 @@ function prev(o::Raw) d = Char(data[findprev(==(UInt8('<')), data, j)+1]) i = j - 1 next_type = type - if c !== '>' || type === RawElementClose && d === '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) # text or empty whitespace + if c !== '>' || type === RawElementClose && d !== '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) # text or empty whitespace type = RawText i=findprev(==(UInt8('>')), data, j) + 1 i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, i) : i # If preserving whitespace, retain leading and trailing whitespace From bf1a5cdfc0348bfe4d91b31198229369dad4288a Mon Sep 17 00:00:00 2001 From: Tim Gebbels Date: Fri, 27 Jun 2025 18:22:33 +0100 Subject: [PATCH 3/4] Reinstate `===` for `prev` test --- src/raw.jl | 2 +- test/runtests.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/raw.jl b/src/raw.jl index e849993..d3abda5 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -358,7 +358,7 @@ function prev(o::Raw) k = findprev(!isspace, data, j) if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0 length(ctx.preserve_space)>0 && pop!(ctx.preserve_space) # pop the previous context - return Raw(data) # RawDocument + return Raw(data, ctx) # RawDocument end j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? 
k : j c = Char(o.data[j]) diff --git a/test/runtests.jl b/test/runtests.jl index a04cc4a..2418c54 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -116,7 +116,7 @@ end @test String(doc[end]) == "" @testset "next and prev" begin - @test XML.prev(doc[1]) == data + @test XML.prev(doc[1]) === data @test prev(data) === nothing @test XML.next(doc[end]) === nothing From 35f6ed849014b1205a2234b175976deb80d26535 Mon Sep 17 00:00:00 2001 From: Tim Gebbels Date: Fri, 27 Jun 2025 23:32:11 +0100 Subject: [PATCH 4/4] After hours tidy-up of superfluous code! --- src/raw.jl | 46 ++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/src/raw.jl b/src/raw.jl index d3abda5..f0ae362 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -28,10 +28,10 @@ x === RawDocument ? Document : nothing -struct XMLSpaceContext - preserve_space::Vector{Bool} # Stack to track xml:space state -end -XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving +#struct XMLSpaceContext +# preserve_space::Vector{Bool} # Stack to track xml:space state +#end +#XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving #-----------------------------------------------------------------------------# Raw """ @@ -69,9 +69,9 @@ struct Raw pos::Int len::Int data::Vector{UInt8} - ctx::XMLSpaceContext + ctx::Vector{Bool} # Context for xml:space (Vector so mutable) end -Raw(data::Vector{UInt8}, ctx=XMLSpaceContext()) = Raw(RawDocument, 0, 0, 0, data, ctx) +Raw(data::Vector{UInt8}, ctx=[false]) = Raw(RawDocument, 0, 0, 0, data, ctx) Base.read(filename::String, ::Type{Raw}) = isfile(filename) ? 
@@ -169,25 +169,17 @@ function attributes(o::Raw) i = name_start(o.data, i) i = name_stop(o.data, i) out=get_attributes(o.data, i + 1, o.pos + o.len) - if !isnothing(out) && haskey(out, "xml:space") + if o.type === RawElementOpen && !isnothing(out) && haskey(out, "xml:space") # If xml:space attribute is present, we need to preserve whitespace if out["xml:space"] == "preserve" - push!(o.ctx.preserve_space, true) + o.ctx[1]= true elseif out["xml:space"] == "default" - push!(o.ctx.preserve_space, false) + o.ctx[1] = false else error("Invalid value for xml:space attribute: $(out["xml:space"]). Must be 'preserve' or 'default'.") end end out - - elseif o.type === RawText - if length(o.ctx.preserve_space)>0 - push!(o.ctx.preserve_space, o.ctx.preserve_space[end]) - else - push!(o.ctx.preserve_space, false) - end - nothing elseif o.type === RawDeclaration get_attributes(o.data, o.pos + 6, o.pos + o.len) else @@ -225,11 +217,7 @@ function children(o::Raw) out = Raw[] for item in xml_nodes(o) if item.depth == depth + 1 - if length(item.ctx.preserve_space) > 0 - item.ctx.preserve_space[1] = o.ctx.preserve_space[end] # inherit the context - else - push!(item.ctx.preserve_space, false) - end + item.ctx[1] = o.ctx[1] # inherit the context o.type==RawElementOpen && attributes(item) push!(out, item) end @@ -284,20 +272,19 @@ function next(o::Raw) ctx = o.ctx k = findnext(!isspace, data, i) if (isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0) - length(ctx.preserve_space)>0 && pop!(ctx.preserve_space) # pop the previous context return nothing end - i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? k : i + i = (ctx[1]) ? 
i : k j = i + 1 c = Char(o.data[k]) d = Char(o.data[k+1]) if type === RawElementOpen || type === RawDocument depth += 1 end - if c !== '<' || type === RawElementOpen && d === '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) + if c !== '<' || type === RawElementOpen && d === '/' && (ctx[1]) type = RawText j = findnext(==(UInt8('<')), data, i) - 1 - j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, j) : j # preserving whitespace if needed + j = (ctx[1]) ? j : findprev(!isspace, data, j) # preserving whitespace if needed else i=k j=k+1 @@ -357,18 +344,17 @@ function prev(o::Raw) j = o.pos - 1 k = findprev(!isspace, data, j) if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0 - length(ctx.preserve_space)>0 && pop!(ctx.preserve_space) # pop the previous context return Raw(data, ctx) # RawDocument end - j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? k : j + j = (ctx[1]) ? j : k c = Char(o.data[j]) d = Char(data[findprev(==(UInt8('<')), data, j)+1]) i = j - 1 next_type = type - if c !== '>' || type === RawElementClose && d !== '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) # text or empty whitespace + if c !== '>' || type === RawElementClose && d !== '/' && (ctx[1]) # text or empty whitespace type = RawText i=findprev(==(UInt8('>')), data, j) + 1 - i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, i) : i # If preserving whitespace, retain leading and trailing whitespace + i = (ctx[1]) ? i : findprev(!isspace, data, i) # If preserving whitespace, retain leading and trailing whitespace else j=k i=k-1