Skip to content

Commit f721e29

Browse files
authored
Merge pull request #2 from TimG1964/Normalize_newlines
Normalize newlines Implements XML 1.1 §2.11 line-end normalization: - CR (0x0D) alone → LF (0x0A) - CR LF pair → LF - NEL (U+0085) → LF - LS (U+2028) → LF
2 parents 932b1b5 + 50aa9aa commit f721e29

File tree

2 files changed

+132
-43
lines changed

2 files changed

+132
-43
lines changed

src/raw.jl

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ end
7070
function Raw(data::Vector{UInt8})
    # Look for the literal bytes "xml:space" in the input as given (before
    # line-end normalization) and remember whether they occur anywhere.
    has_xml_space = !isnothing(findfirst(Vector{UInt8}("xml:space"), data))
    # All further processing works on the newline-normalized copy.
    normalized = normalize_newlines(data)
    return Raw(RawDocument, 0, 0, 0, normalized, [false], has_xml_space)
end
7576
function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false])
@@ -100,6 +101,40 @@ Base.read(io::IO, ::Type{Raw}) = Raw(read(io))
100101

101102
Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x))
102103

104+
105+
"""
106+
normalize_newlines(bytes::Vector{UInt8}) -> Vector{UInt8}
107+
108+
Implements XML 1.1 §2.11 line-end normalization:
109+
- CR (0x0D) alone → LF (0x0A)
110+
- CR LF pair → LF
111+
- NEL (U+0085) → LF
112+
- LS (U+2028) → LF
113+
"""
114+
function normalize_newlines(bytes::Vector{UInt8})
    # Returns a new vector; `bytes` itself is never modified.
    #
    # Input is treated as UTF-8. Each of the following becomes ONE LF (0x0A):
    #   CR LF            0x0D 0x0A
    #   CR NEL           0x0D 0xC2 0x85  (NEL = U+0085 in UTF-8)
    #   CR + raw 0x85    0x0D 0x85       (single-byte NEL; kept for backward compatibility)
    #   lone CR          0x0D
    #   NEL              0xC2 0x85
    #   LS (U+2028)      0xE2 0x80 0xA8
    # Everything else, including truncated multi-byte sequences at the end of
    # the input, is copied through unchanged.
    n = length(bytes)
    out = Vector{UInt8}(undef, n)  # normalization never grows the data
    outlen = 0
    i = 1
    while i <= n
        b = bytes[i]
        step = 1
        if b == 0x0D
            b = 0x0A
            if i < n
                nxt = bytes[i+1]
                if nxt == 0x0A || nxt == 0x85
                    step = 2
                elseif nxt == 0xC2 && i + 2 <= n && bytes[i+2] == 0x85
                    # CR followed by UTF-8 encoded NEL: the whole pair is one line end.
                    step = 3
                end
            end
        elseif b == 0xC2 && i < n && bytes[i+1] == 0x85
            b = 0x0A
            step = 2
        elseif b == 0xE2 && i + 2 <= n && bytes[i+1] == 0x80 && bytes[i+2] == 0xA8
            b = 0x0A
            step = 3
        end
        outlen += 1
        out[outlen] = b
        i += step
    end
    return resize!(out, outlen)
end
137+
103138
# Mostly for debugging
104139
# Return up to the next `n + 1` bytes after the current token (from `o.pos + o.len + 1`),
# clamped to the end of `o.data`, as a String. `@view` avoids the intermediate slice copy;
# a bare `view(x)` call with no indices is only in bounds for one-element arrays.
Base.peek(o::Raw, n::Int) = String(@view o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)])
105140

@@ -146,7 +181,6 @@ function get_attributes(data, i, j)
146181
out = OrderedDict{String,String}()
147182
while !isnothing(i) && i < j
148183
key, i = get_name(data, i)
149-
#haskey(out, key) && error("Duplicate attribute name found: $key") # would this be useful?
150184
# get quotechar the value is wrapped in (either ' or ")
151185
i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1)
152186
quotechar = data[i]
@@ -566,3 +600,4 @@ function prev_no_xml_space(o::Raw) # same as v0.3.5
566600
end
567601
return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
568602
end
603+

test/runtests.jl

Lines changed: 96 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ end
116116
@test String(doc[end]) == "</catalog>"
117117

118118
@testset "next and prev" begin
119-
@test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx
120119
@test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx
121120
@test prev(data) === nothing
122121
@test XML.next(doc[end]) === nothing
@@ -429,19 +428,18 @@ end
429428
@test XML.value(d2[1][6][1]) == " after default gap "
430429
@test XML.value(d2[1][7]) == "\n"
431430
end
432-
433-
# @testset "XML whitespace vs Unicode whitespace" begin
434-
# nbsp = "\u00A0"
435-
# s = """<root>
436-
# <a> x\t\n </a>
437-
# <b>$(nbsp) y $(nbsp)</b>
438-
# <c xml:space="default">$(nbsp) z $(nbsp)</c>
439-
# </root>"""
440-
# d = XML.parse(XML.Node, s)
441-
# @test XML.value(d[1][1][1]) == "x"
442-
# @test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)"
443-
# @test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)"
444-
# end
431+
@testset "XML whitespace vs Unicode whitespace" begin
432+
nbsp = "\u00A0"
433+
s = """<root>
434+
<a> x\t\n </a>
435+
<b>$(nbsp) y $(nbsp)</b>
436+
<c xml:space="default">$(nbsp) z $(nbsp)</c>
437+
</root>"""
438+
d = XML.parse(XML.Node, s)
439+
@test XML.value(d[1][1][1]) == "x"
440+
@test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)"
441+
@test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)"
442+
end
445443

446444
@testset "CDATA/Comment/PI boundaries" begin
447445
s = """<root>
@@ -486,17 +484,21 @@ end
486484
@test XML.value(d[1][1]) == "a"
487485
end
488486

489-
# @testset "entities expanding to whitespace" begin
490-
# s = """<root>
491-
# <a> &#x20; a &#x0A; </a>
492-
# <b xml:space="preserve">&#x20; b &#x0A;</b>
493-
# <c>&#xA0;c&#xA0;</c>
494-
# </root>"""
495-
# d = XML.parse(XML.Node, s)
496-
# @test XML.value(d[1][1][1]) == "a"
497-
# @test XML.value(d[1][2][1]) == " b \n"
498-
# @test XML.value(d[1][3][1]) == "\u00A0c\u00A0"
499-
# end
487+
@testset "entities expanding to whitespace" begin
488+
chr1="\u0020"
489+
chr2="\u000A"
490+
chr3="\u00A0"
491+
492+
s = """<root>
493+
<a> $(chr1) a $(chr2) </a>
494+
<b xml:space="preserve">$(chr1) b $(chr2)</b>
495+
<c>$(chr3)c$(chr3)</c>
496+
</root>"""
497+
d = XML.parse(XML.Node, s)
498+
@test XML.value(d[1][1][1]) == "a"
499+
@test XML.value(d[1][2][1]) == " b \n"
500+
@test XML.value(d[1][3][1]) == "$(chr3)c$(chr3)"
501+
end
500502

501503
@testset "invalid values and placement" begin
502504
s_bad = """<root><x xml:space="weird"> t </x></root>"""
@@ -535,23 +537,22 @@ end
535537
@test reverse(back)[2:end] == toks[1:end-1]
536538
end
537539

538-
# @testset "write/read roundtrip extremes" begin
539-
# XML.write doesn't respect xml:space="preserve" in the current implementation so roundtrip isn't possible.
540-
# xml = """<root>
541-
# <p xml:space="preserve"> </p>
542-
# <q> </q>
543-
# <r xml:space="default"> r </r>
544-
# <s xml:space="preserve"> pre <t/> post </s>
545-
# </root>"""
546-
# n = XML.parse(XML.Node, xml)
547-
# io = IOBuffer(); XML.write(io, n)
548-
# n2 = XML.parse(XML.Node, String(take!(io)))
549-
# @test n == n2
550-
# @test XML.write(n2[1][1]) == "<p xml:space=\"preserve\"> </p>"
551-
# @test XML.write(n2[1][2]) == "<q/>"
552-
# @test XML.value(n2[1][3][1]) == "r"
553-
# @test XML.write(n2[1][4]) == "<s xml:space=\"preserve\"> pre <t/> post </s>"
554-
# end
540+
@testset "write/read roundtrip extremes" begin
541+
xml = """<root>
542+
<p xml:space="preserve"> </p>
543+
<q> </q>
544+
<r xml:space="default"> r </r>
545+
<s xml:space="preserve"> pre <t/> post </s>
546+
</root>"""
547+
n = XML.parse(XML.Node, xml)
548+
io = IOBuffer(); XML.write(io, n)
549+
n2 = XML.parse(XML.Node, String(take!(io)))
550+
@test n == n2
551+
@test XML.write(n2[1][1]) == "<p xml:space=\"preserve\"> </p>"
552+
@test XML.write(n2[1][2]) == "<q/>"
553+
@test XML.value(n2[1][3][1]) == "r"
554+
@test XML.write(n2[1][4]) == "<s xml:space=\"preserve\"> pre <t/> post </s>"
555+
end
555556

556557
@testset "self-closing/empty/whitespace-only children" begin
557558
s = """<root>
@@ -578,6 +579,58 @@ end
578579

579580
end
580581

582+
#-----------------------------------------------------------------------------# Normalize_newlines
583+
# Helper to make writing tests easier
584+
# Convert `s` to a `Vector{UInt8}` (the UTF-8 code units when `s` is a `String`).
to_bytes(s) = Vector{UInt8}(s)
585+
# Build a `String` from the bytes in `b`.
# NOTE: per the Julia docs, when `b` is a `Vector{UInt8}` the String takes ownership of
# the buffer and `b` is left empty afterwards, so do not reuse `b` after calling this.
from_bytes(b) = String(b)
586+
587+
@testset "normalize_newlines" begin
588+
# 1. Lone CR -> LF
589+
@test XML.normalize_newlines(to_bytes("a\rb")) == to_bytes("a\nb")
590+
591+
# 2. CRLF -> LF
592+
@test XML.normalize_newlines(to_bytes("a\r\nb")) == to_bytes("a\nb")
593+
594+
# 3. CR followed by a raw 0x85 byte (single-byte NEL, not the UTF-8 form 0xC2 0x85) -> LF
595+
@test XML.normalize_newlines(UInt8[0x61, 0x0D, 0x85, 0x62]) == to_bytes("a\nb")
596+
597+
# 4. NEL (U+0085) UTF-8 form 0xC2 0x85 -> LF
598+
@test XML.normalize_newlines(UInt8[0x61, 0xC2, 0x85, 0x62]) == to_bytes("a\nb")
599+
600+
# 5. LINE SEPARATOR (U+2028) UTF-8 form 0xE2 0x80 0xA8 -> LF
601+
@test XML.normalize_newlines(UInt8[0x61, 0xE2, 0x80, 0xA8, 0x62]) == to_bytes("a\nb")
602+
603+
# 6. Mixed newline types in one string
604+
mixed = UInt8[0x61, 0x0D, 0x0A, 0x62, 0xC2, 0x85, 0x63, 0xE2, 0x80, 0xA8, 0x64, 0x0D, 0x65]
605+
expected = to_bytes("a\nb\nc\nd\ne")
606+
@test XML.normalize_newlines(mixed) == expected
607+
608+
# 7. Consecutive CRs
609+
@test XML.normalize_newlines(to_bytes("a\r\rb")) == to_bytes("a\n\nb")
610+
611+
# 8. Leading/trailing newlines
612+
@test XML.normalize_newlines(to_bytes("\rabc\r")) == to_bytes("\nabc\n")
613+
614+
# 9. Empty input
615+
@test XML.normalize_newlines(UInt8[]) == UInt8[]
616+
617+
# 10. No newline characters
618+
@test XML.normalize_newlines(to_bytes("abcdef")) == to_bytes("abcdef")
619+
620+
# 11. Unicode safety: multi-byte chars around newlines
621+
s = "α\r\nβ" # α = 0xCE 0xB1, β = 0xCE 0xB2
622+
@test XML.normalize_newlines(to_bytes(s)) == to_bytes("α\nβ")
623+
624+
# 12. Boundary case: CR at end of buffer
625+
@test XML.normalize_newlines(UInt8[0x61, 0x0D]) == to_bytes("a\n")
626+
627+
# 13. Boundary case: 0xC2 at end (incomplete UTF-8 NEL)
628+
@test XML.normalize_newlines(UInt8[0x61, 0xC2]) == UInt8[0x61, 0xC2]
629+
630+
# 14. Boundary case: 0xE2 0x80 at end (incomplete LINE SEPARATOR)
631+
@test XML.normalize_newlines(UInt8[0x61, 0xE2, 0x80]) == UInt8[0x61, 0xE2, 0x80]
632+
end
633+
581634
#-----------------------------------------------------------------------------# roundtrip
582635
@testset "read/write/read roundtrip" begin
583636
for path in all_files
@@ -642,3 +695,4 @@ end
642695
xyz = XML.Element("point"; kw...)
643696
@test collect(keys(attributes(xyz))) == string.(collect('a':'z'))
644697
end
698+

0 commit comments

Comments
 (0)