diff --git a/src/raw.jl b/src/raw.jl
index b2b44f5..f0ae362 100644
--- a/src/raw.jl
+++ b/src/raw.jl
@@ -28,6 +28,11 @@
x === RawDocument ? Document :
nothing
+#struct XMLSpaceContext
+# preserve_space::Vector{Bool} # Stack to track xml:space state
+#end
+#XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving
+
#-----------------------------------------------------------------------------# Raw
"""
Raw(filename::String)
@@ -64,8 +69,10 @@ struct Raw
pos::Int
len::Int
data::Vector{UInt8}
+ ctx::Vector{Bool} # Context for xml:space (Vector so mutable)
end
-Raw(data::Vector{UInt8}) = Raw(RawDocument, 0, 0, 0, data)
+Raw(data::Vector{UInt8}, ctx=[false]) = Raw(RawDocument, 0, 0, 0, data, ctx)
+
Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
Raw(Mmap.mmap(filename)) :
@@ -117,7 +124,7 @@ end
# starting at position i, return attributes up until the next '>' or '?' (DTD)
function get_attributes(data, i, j)
i = name_start(data, i)
- i > j && return nothing
+ (isnothing(j) || isnothing(i) || i > j) && return nothing
out = OrderedDict{String, String}()
while !isnothing(i) && i < j
key, i = get_name(data, i)
@@ -161,7 +168,18 @@ function attributes(o::Raw)
i = o.pos
i = name_start(o.data, i)
i = name_stop(o.data, i)
- get_attributes(o.data, i + 1, o.pos + o.len)
+ out=get_attributes(o.data, i + 1, o.pos + o.len)
+ if o.type === RawElementOpen && !isnothing(out) && haskey(out, "xml:space")
+ # An xml:space attribute sets the whitespace flag: "preserve" turns it on, "default" turns it off
+ if out["xml:space"] == "preserve"
+ o.ctx[1]= true
+ elseif out["xml:space"] == "default"
+ o.ctx[1] = false
+ else
+ error("Invalid value for xml:space attribute: $(out["xml:space"]). Must be 'preserve' or 'default'.")
+ end
+ end
+ out
elseif o.type === RawDeclaration
get_attributes(o.data, o.pos + 6, o.pos + o.len)
else
@@ -198,7 +216,11 @@ function children(o::Raw)
depth = o.depth
out = Raw[]
for item in xml_nodes(o)
- item.depth == depth + 1 && push!(out, item)
+ if item.depth == depth + 1
+ item.ctx[1] = o.ctx[1] # inherit the context
+ o.type==RawElementOpen && attributes(item)
+ push!(out, item)
+ end
item.depth == depth && break
o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
end
@@ -247,55 +269,64 @@ function next(o::Raw)
depth = o.depth
data = o.data
type = o.type
- i = findnext(!isspace, data, i) # skip insignificant whitespace
- isnothing(i) && return nothing
+ ctx = o.ctx
+ k = findnext(!isspace, data, i)
+ if (isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0)
+ return nothing
+ end
+ i = (ctx[1]) ? i : k
+ j = i + 1
+ c = Char(o.data[k])
+ d = Char(o.data[k+1])
if type === RawElementOpen || type === RawDocument
depth += 1
end
- c = Char(o.data[i])
- j = i + 1
- if c !== '<'
+ if c !== '<' || type === RawElementOpen && d === '/' && (ctx[1])
type = RawText
j = findnext(==(UInt8('<')), data, i) - 1
- j = findprev(!isspace, data, j) # "rstrip"
- elseif c === '<'
- c2 = Char(o.data[i + 1])
- if c2 === '!'
- c3 = Char(o.data[i + 2])
- if c3 === '-'
- type = RawComment
- j = findnext(Vector{UInt8}("-->"), data, i)[end]
- elseif c3 === '['
- type = RawCData
- j = findnext(Vector{UInt8}("]]>"), data, i)[end]
- elseif c3 === 'D' || c3 == 'd'
- type = RawDTD
- j = findnext(==(UInt8('>')), data, i)
- while sum(==(UInt8('>')), data[i:j]) != sum(==(UInt8('<')), data[i:j])
- j = findnext(==(UInt8('>')), data, j + 1)
+ j = (ctx[1]) ? j : findprev(!isspace, data, j) # preserving whitespace if needed
+ else
+ i=k
+ j=k+1
+ if c === '<'
+ c2 = Char(o.data[i + 1])
+ if c2 === '!'
+ c3 = Char(o.data[i + 2])
+ if c3 === '-'
+ type = RawComment
+ j = findnext(Vector{UInt8}("-->"), data, i)[end]
+ elseif c3 === '['
+ type = RawCData
+ j = findnext(Vector{UInt8}("]]>"), data, i)[end]
+ elseif c3 === 'D' || c3 == 'd'
+ type = RawDTD
+ j = findnext(==(UInt8('>')), data, i)
+ while sum(==(UInt8('>')), data[k:j]) != sum(==(UInt8('<')), data[i:j])
+ j = findnext(==(UInt8('>')), data, j + 1)
+ end
end
- end
- elseif c2 === '?'
- if get_name(data, i + 2)[1] == "xml"
- type = RawDeclaration
- else
- type = RawProcessingInstruction
- end
- j = findnext(Vector{UInt8}("?>"), data, i)[end]
- elseif c2 === '/'
- type = RawElementClose
- depth -= 1
- j = findnext(==(UInt8('>')), data, i)
- else
- j = findnext(==(UInt8('>')), data, i)
- if data[j-1] === UInt8('/')
- type = RawElementSelfClosed
+ elseif c2 === '?'
+ if get_name(data, i + 2)[1] == "xml"
+ type = RawDeclaration
+ else
+ type = RawProcessingInstruction
+ end
+ j = findnext(Vector{UInt8}("?>"), data, i)[end]
+ elseif c2 === '/'
+ type = RawElementClose
+ depth -= 1
+ j = findnext(==(UInt8('>')), data, i)
else
- type = RawElementOpen
+ j = findnext(==(UInt8('>')), data, i)
+ if data[j-1] === UInt8('/')
+ type = RawElementSelfClosed
+ else
+ type = RawElementOpen
+ end
end
end
end
- return Raw(type, depth, i, j - i, data)
+ return Raw(type, depth, i, j - i, data, ctx)
end
#-----------------------------------------------------------------------------# prev Raw
@@ -308,52 +339,61 @@ function prev(o::Raw)
depth = o.depth
data = o.data
type = o.type
+ ctx = o.ctx
type === RawDocument && return nothing
j = o.pos - 1
- j = findprev(!isspace, data, j) # skip insignificant whitespace
- isnothing(j) && return Raw(data) # RawDocument
+ k = findprev(!isspace, data, j)
+ if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0
+ return Raw(data, ctx) # RawDocument
+ end
+ j = (ctx[1]) ? j : k
c = Char(o.data[j])
+ d = Char(data[findprev(==(UInt8('<')), data, j)+1])
i = j - 1
next_type = type
- if c !== '>' # text
+ if c !== '>' || type === RawElementClose && d !== '/' && (ctx[1]) # text or empty whitespace
type = RawText
- i = findprev(==(UInt8('>')), data, j) + 1
- i = findnext(!isspace, data, i) # "lstrip"
- elseif c === '>'
- c2 = Char(o.data[j - 1])
- if c2 === '-'
- type = RawComment
- i = findprev(Vector{UInt8}("<--"), data, j)[1]
- elseif c2 === ']'
- type = RawCData
- i = findprev(Vector{UInt8}("')), data, j) + 1
+ i = (ctx[1]) ? i : findprev(!isspace, data, i) # If preserving whitespace, retain leading and trailing whitespace
+ else
+ j=k
+ i=k-1
+ if c === '>'
+ c2 = Char(o.data[j - 1])
+ if c2 === '-'
+ type = RawComment
+ i = findprev(Vector{UInt8}("<--"), data, j)[1]
+ elseif c2 === ']'
+ type = RawCData
+ i = findprev(Vector{UInt8}(".")
+ end
+ end
else
- i = findprev(==(UInt8('<')), data, j)
- char = Char(data[i+1])
- if char === '/'
- type = RawElementClose
- elseif char === '!'
- type = DTD
- elseif isletter(char) || char === '_'
- type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
- else
- error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
- end
+ error("Unreachable reached in XML.prev")
end
- else
- error("Unreachable reached in XML.prev")
end
if type !== RawElementOpen && next_type === RawElementClose
depth += 1
elseif type == RawElementOpen && next_type !== RawElementClose
depth -= 1
end
- return Raw(type, depth, i, j - i, data)
+ return Raw(type, depth, i, j - i, data, ctx)
end
diff --git a/test/runtests.jl b/test/runtests.jl
index d41924b..2418c54 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -174,6 +174,114 @@ end
end
end
+#-----------------------------------------------------------------------------# Preserve whitespace
+@testset "xml:space" begin
+ @testset "Basic xml:space functionality" begin
+
+ # Test 1: xml:space="preserve" should preserve entirely empty whitespace
+ xml1 = """ """
+ doc1 = parse(XML.Node, xml1)
+ text_content = XML.value(doc1[1][1][1])
+ @test text_content == " "
+
+ # Test 2: xml:space="preserve" should preserve leading and trailing whitespace
+ xml2 = """ leading and trailing spaces """
+ doc2 = parse(XML.Node, xml2)
+ text_content = XML.value(doc2[1][1][1])
+ @test text_content == " leading and trailing spaces "
+
+ # Test 3: Without xml:space, entirely empty whitespace should create a self-closing node
+ xml3 = """ """
+ doc3 = XML.parse(XML.Node, xml3)
+ text_content = XML.write(doc3[1][1])
+ @test text_content == ""
+
+ # Test 4: Without xml:space, whitespace should be normalized
+ xml4 = """ gets normalized """
+ doc4 = XML.parse(XML.Node, xml4)
+ text_content = XML.value(doc4[1][1][1])
+ @test text_content == "gets normalized"
+
+ # Test 5: xml:space="default" should normalize whitespace
+ xml5 = """ gets normalized """
+ doc5 = XML.parse(XML.Node, xml5)
+ text_content = XML.value(doc5[1][1][1])
+ @test text_content == "gets normalized"
+ end
+
+ @testset "xml:space inheritance" begin
+ # Test 6: Children inherit parent's xml:space="preserve"
+ xml6 = """
+ parent text
+ child text
+
+ """
+ doc6 = XML.parse(XML.Node, xml6)
+ # Both parent and child should preserve whitespace
+ @test contains(XML.value(doc6[1][1][1]), "parent text \n")
+ @test XML.value(doc6[1][1][2][1]) == " child text "
+
+ # Test 7: xml:space="default" overrides parent's "preserve"
+ xml7 = """
+ normalized despite parent
+ """
+ doc7 = XML.parse(XML.Node, xml7)
+ @test XML.value(doc7[1][1][1]) == "normalized despite parent"
+ end
+
+ @testset "Nesting scenarios" begin
+ # Test 8: Multiple levels of xml:space changes
+ xml8 = """
+ preserved
+ normalized
+ preserved again
+
+
+ """
+ doc8 = XML.parse(XML.Node, xml8)
+
+ # level1 should preserve (inherits from root)
+ level1_text = XML.value(doc8[1][1][1])
+ @test level1_text == " preserved \n "
+
+ # level2 should normalize (explicit xml:space="default")
+ level2_text = XML.value(doc8[1][1][2][1])
+ @test level2_text == "normalized"
+
+ # level3 should preserve (explicit xml:space="preserve")
+ level3_text = XML.value(doc8[1][1][2][2][1])
+ @test level3_text == " preserved again "
+
+ # Test 9: repeated multiple levels of xml:space changes
+ xml9 = """
+ preserved
+ normalized
+ preserved again
+
+
+ preserved b
+ normalized b
+ preserved again b
+
+
+ """
+ doc9 = XML.parse(XML.Node, xml9)
+
+ # level1b should preserve (inherits from root)
+ level1b_text = XML.value(doc9[1][2][1])
+ @test level1b_text == " preserved b \n "
+
+ # level2b should normalize (explicit xml:space="default")
+ level2b_text = XML.value(doc9[1][2][2][1])
+ @test level2b_text == "normalized b"
+
+ # level3b should preserve (explicit xml:space="preserve")
+ level3b_text = XML.value(doc9[1][2][2][2][1])
+ @test level3b_text == " preserved again b "
+
+ end
+end
+
#-----------------------------------------------------------------------------# roundtrip
@testset "read/write/read roundtrip" begin
for path in all_files