Skip to content

Commit caf8902

Browse files
authored
Merge pull request #3 from TimG1964/Normalize_newlines
Undo normalize newlines
2 parents f721e29 + b3ac812 commit caf8902

File tree

2 files changed

+0
-87
lines changed

2 files changed

+0
-87
lines changed

src/raw.jl

Lines changed: 0 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,6 @@ end
7070
function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false])
7171
needle = Vector{UInt8}("xml:space")
7272
has_xml_space = findfirst(needle, data) !== nothing
73-
data=normalize_newlines(data)
7473
return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space)
7574
end
7675
function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false])
@@ -101,40 +100,6 @@ Base.read(io::IO, ::Type{Raw}) = Raw(read(io))
101100

102101
Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x))
103102

104-
105-
"""
106-
normalize_newlines(bytes::Vector{UInt8}) -> Vector{UInt8}
107-
108-
Implements XML 1.1 §2.11 line-end normalization:
109-
- CR (0x0D) alone → LF (0x0A)
110-
- CR LF pair → LF
111-
- NEL (U+0085) → LF
112-
- LS (U+2028) → LF
113-
"""
114-
function normalize_newlines(bytes::Vector{UInt8})
115-
n = length(bytes)
116-
out = Vector{UInt8}(undef, n)
117-
outlen = 0
118-
i = 1
119-
while i <= n
120-
@inbounds b = bytes[i]
121-
if b == 0x0D
122-
outlen += 1; out[outlen] = 0x0A
123-
i += (i < n && (bytes[i+1] == 0x0A || bytes[i+1] == 0x85)) ? 2 : 1
124-
elseif b == 0xC2 && i < n && bytes[i+1] == 0x85
125-
outlen += 1; out[outlen] = 0x0A
126-
i += 2
127-
elseif b == 0xE2 && i+2 <= n && bytes[i+1] == 0x80 && bytes[i+2] == 0xA8
128-
outlen += 1; out[outlen] = 0x0A
129-
i += 3
130-
else
131-
outlen += 1; out[outlen] = b
132-
i += 1
133-
end
134-
end
135-
return resize!(out, outlen)
136-
end
137-
138103
# Mostly for debugging
139104
Base.peek(o::Raw, n::Int) = String(view(o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)]))
140105

test/runtests.jl

Lines changed: 0 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -579,58 +579,6 @@ end
579579

580580
end
581581

582-
#-----------------------------------------------------------------------------# Normalize_newlines
583-
# Helper to make writing tests easier
584-
to_bytes(s) = Vector{UInt8}(s)
585-
from_bytes(b) = String(b)
586-
587-
@testset "normalize_newlines" begin
588-
# 1. Lone CR -> LF
589-
@test XML.normalize_newlines(to_bytes("a\rb")) == to_bytes("a\nb")
590-
591-
# 2. CRLF -> LF
592-
@test XML.normalize_newlines(to_bytes("a\r\nb")) == to_bytes("a\nb")
593-
594-
# 3. CR NEL (0x85) -> LF
595-
@test XML.normalize_newlines(UInt8[0x61, 0x0D, 0x85, 0x62]) == to_bytes("a\nb")
596-
597-
# 4. NEL (U+0085) UTF-8 form 0xC2 0x85 -> LF
598-
@test XML.normalize_newlines(UInt8[0x61, 0xC2, 0x85, 0x62]) == to_bytes("a\nb")
599-
600-
# 5. LINE SEPARATOR (U+2028) UTF-8 form 0xE2 0x80 0xA8 -> LF
601-
@test XML.normalize_newlines(UInt8[0x61, 0xE2, 0x80, 0xA8, 0x62]) == to_bytes("a\nb")
602-
603-
# 6. Mixed newline types in one string
604-
mixed = UInt8[0x61, 0x0D, 0x0A, 0x62, 0xC2, 0x85, 0x63, 0xE2, 0x80, 0xA8, 0x64, 0x0D, 0x65]
605-
expected = to_bytes("a\nb\nc\nd\ne")
606-
@test XML.normalize_newlines(mixed) == expected
607-
608-
# 7. Consecutive CRs
609-
@test XML.normalize_newlines(to_bytes("a\r\rb")) == to_bytes("a\n\nb")
610-
611-
# 8. Leading/trailing newlines
612-
@test XML.normalize_newlines(to_bytes("\rabc\r")) == to_bytes("\nabc\n")
613-
614-
# 9. Empty input
615-
@test XML.normalize_newlines(UInt8[]) == UInt8[]
616-
617-
# 10. No newline characters
618-
@test XML.normalize_newlines(to_bytes("abcdef")) == to_bytes("abcdef")
619-
620-
# 11. Unicode safety: multi-byte chars around newlines
621-
s = "α\r\nβ" # α = 0xCE 0xB1, β = 0xCE 0xB2
622-
@test XML.normalize_newlines(to_bytes(s)) == to_bytes("α\nβ")
623-
624-
# 12. Boundary case: CR at end of buffer
625-
@test XML.normalize_newlines(UInt8[0x61, 0x0D]) == to_bytes("a\n")
626-
627-
# 13. Boundary case: 0xC2 at end (incomplete UTF-8 NEL)
628-
@test XML.normalize_newlines(UInt8[0x61, 0xC2]) == UInt8[0x61, 0xC2]
629-
630-
# 14. Boundary case: 0xE2 0x80 at end (incomplete LINE SEPARATOR)
631-
@test XML.normalize_newlines(UInt8[0x61, 0xE2, 0x80]) == UInt8[0x61, 0xE2, 0x80]
632-
end
633-
634582
#-----------------------------------------------------------------------------# roundtrip
635583
@testset "read/write/read roundtrip" begin
636584
for path in all_files

0 commit comments

Comments (0)