Skip to content

Commit 7d452d1

Browse files
committed
work on DTD parsing, make escaping idempotent
1 parent 6605d93 commit 7d452d1

File tree

7 files changed

+149
-15
lines changed

7 files changed

+149
-15
lines changed

Project.toml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,3 @@ Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
88

99
[compat]
1010
julia = "1.7"
11-
12-
[extras]
13-
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
14-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
15-
16-
[targets]
17-
test = ["Test", "Downloads"]

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,3 +193,10 @@ Platform Info:
193193
EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 165.21
194194
EzXML.readxml ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 239.197
195195
```
196+
197+
<br>
198+
<br>
199+
200+
# Possible Gotchas
201+
202+
XML.jl doesn't escape special characters (`<`, `>`, `&`, `"`, and `'`) for you, but it provides `XML.escape(::String)` and `XML.unescape(::String)` utility functions.

src/XML.jl

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,24 @@ export
1111
parent, depth, next, prev
1212

1313
#-----------------------------------------------------------------------------# escape/unescape
14-
# only used by Text nodes during `XML.write`
14+
# TODO: be smarter :
15+
# 1. to avoid escaping entities e.g. `&entity;`
16+
# 2. to avoid re-escaping already escaped entities e.g. `&amp;`
1517
# Character => entity pairs for the five XML special characters. Every key is a `Char`
# so the tuple is homogeneous; `reverse.(escape_chars)` yields the entity => character pairs.
const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;")
16-
escape(x::AbstractString) = replace(x, escape_chars...)
18+
1719
# Replace every entity listed in `escape_chars` with the character it stands for.
unescape(x::AbstractString) = replace(x, reverse.(escape_chars)...)
1820

21+
# requires special handling of `&` to avoid double-escaping
22+
"""
    escape(x::AbstractString)

Return a copy of `x` in which `&`, `<`, `>`, `"` and `'` are replaced by their XML
entities.

A `&` that already starts one of the five predefined entities (`&amp;`, `&lt;`, `&gt;`,
`&quot;`, `&apos;`) is left untouched, so `escape(escape(x)) == escape(x)`.
Any `AbstractString` (e.g. a `SubString`) is accepted, matching `unescape`.
"""
function escape(x::AbstractString)
    return replace(
        x,
        # Negative lookahead: skip ampersands that already begin a predefined entity,
        # which is what keeps repeated escaping from producing `&amp;amp;`.
        r"&(?!(?:amp|lt|gt|quot|apos);)" => "&amp;",
        '<' => "&lt;",
        '>' => "&gt;",
        '"' => "&quot;",
        '\'' => "&apos;",
    )
end
31+
1932
#-----------------------------------------------------------------------------# NodeType
2033
"""
2134
NodeType:
@@ -42,8 +55,9 @@ NodeTypes can be used to construct XML.Nodes:
4255
@enum(NodeType, CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text)
4356

4457

45-
#-----------------------------------------------------------------------------# raw
58+
#-----------------------------------------------------------------------------# includes
4659
include("raw.jl")
60+
include("dtd.jl")
4761

4862
abstract type AbstractXMLNode end
4963

@@ -103,7 +117,7 @@ function prev(o::LazyNode)
103117
n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n)
104118
end
105119

106-
#-----------------------------------------------------------------------------?de
120+
#-----------------------------------------------------------------------------# Node
107121
"""
108122
Node(nodetype, tag, attributes, value, children)
109123
Node(node::Node; kw...) # copy node with keyword overrides
@@ -142,6 +156,17 @@ function Node(node::LazyNode)
142156
Node(nodetype, tag, attributes, value, isempty(c) ? nothing : map(Node, c))
143157
end
144158

159+
# NOT in-place for Text Nodes
160+
# Escape the text content of the tree rooted at `o`.
# Entries of `o.children` are overwritten with their escaped versions. A Text node is
# NOT modified in place: a new Text node holding the escaped value is returned instead,
# and a warning is logged unless `warn` is `false`.
function escape!(o::Node, warn::Bool=true)
    if o.nodetype == Text
        if warn
            @warn "escape!() called on a Text Node creates a new node."
        end
        return Text(escape(o.value))
    end
    if isnothing(o.children)
        return o
    end
    kids = o.children
    for idx in eachindex(kids)
        # Nested calls never warn; only the top-level call may.
        kids[idx] = escape!(kids[idx], false)
    end
    return o
end
169+
145170

146171
# Read `filename` as `Raw` and convert the result into a `Node`.
Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw))
147172
# Read `io` as `Raw` and convert the result into a `Node`.
Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw))
@@ -314,7 +339,7 @@ function write(io::IO, x; indentsize::Int=2, depth::Union{Missing,Int}=depth(x))
314339
padding = indent ^ max(0, depth - 1)
315340
print(io, padding)
316341
if nodetype === Text
317-
print(io, escape(value))
342+
print(io, value)
318343
elseif nodetype === Element
319344
print(io, '<', tag)
320345
_print_attrs(io, x)

src/dtd.jl

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#-----------------------------------------------------------------------------# DeclaredElement
2+
"""
    DeclaredElement(name, content)

An `<!ELEMENT name content>` declaration of a DTD.

`content` must be `"ANY"`, `"EMPTY"`, or a parenthesized content model such as
`"(child1,child2)"`. Any other value, including the empty string, throws an error.
"""
struct DeclaredElement
    name::String
    content::String  # "ANY", "EMPTY", or "(children...)"
    function DeclaredElement(name, content)
        # `startswith`/`endswith` are safe on an empty string, whereas indexing
        # `content[1]` would throw a BoundsError before the descriptive error is reached.
        is_group = startswith(content, "(") && endswith(content, ")")
        content in ("ANY", "EMPTY") || is_group ||
            error("DeclaredElement `content` must be 'ANY', 'EMPTY', or '(children...)'. Got $content.")
        return new(name, content)
    end
end
Base.show(io::IO, o::DeclaredElement) = print(io, "<!ELEMENT ", o.name, " ", o.content, ">")
12+
13+
"""
    get_declared_elements(data::Vector{UInt8})

Collect every `<!ELEMENT name content>` declaration found in `data` and return them as a
`Vector{DeclaredElement}` (empty when `data` contains no such declaration).

`content` is either a bare name (e.g. `ANY`, `EMPTY`) or a parenthesized group. Groups
are matched with balanced parentheses, so nested models such as `(a,(b|c))` are kept
whole. An occurrence indicator following the group (`*`, `+`, `?`) is not captured.

Throws an error when a declaration has no content or a group is never closed.
"""
function get_declared_elements(data::Vector{UInt8})
    marker = Vector{UInt8}("<!ELEMENT")
    out = DeclaredElement[]
    found = findnext(marker, data, 1)
    while !isnothing(found)
        name, i = get_name(data, last(found) + 1)
        # `isspace` has no method for `UInt8`, so test each byte as a `Char`.
        i = findnext(b -> !isspace(Char(b)), data, i)
        isnothing(i) && error("`<!ELEMENT $name` declaration has no content.")
        if data[i] == UInt8('(')
            j = _matching_paren(data, i)
            isnothing(j) && error("Unclosed `(` in `<!ELEMENT $name` declaration.")
            content = String(data[i:j])
            i = j + 1
        else
            content, i = get_name(data, i)
        end
        push!(out, DeclaredElement(name, content))
        found = findnext(marker, data, i)
    end
    return out
end

# Index of the `)` closing the `(` at `data[i]`, or `nothing` if it is never closed.
function _matching_paren(data::Vector{UInt8}, i::Int)
    depth = 0
    for j in i:lastindex(data)
        b = data[j]
        if b == UInt8('(')
            depth += 1
        elseif b == UInt8(')')
            depth -= 1
            depth == 0 && return j
        end
    end
    return nothing
end
31+
32+
#-----------------------------------------------------------------------------# DeclaredAttribute
33+
"""
    DeclaredAttribute(element_name, attribute_name, attribute_type, attribute_value)

One attribute declaration of a DTD. It is displayed as
`<!ATTLIST element_name attribute_name attribute_type attribute_value>`.
"""
struct DeclaredAttribute
    element_name::String
    attribute_name::String
    attribute_type::String
    attribute_value::String
end
function Base.show(io::IO, o::DeclaredAttribute)
    fields = (o.element_name, o.attribute_name, o.attribute_type, o.attribute_value)
    print(io, "<!ATTLIST ")
    join(io, fields, " ")
    print(io, ">")
    return nothing
end
40+
41+
"""
    get_declared_attributes(data)

Placeholder: attribute declarations are not parsed yet, so `data` is ignored and an
empty `Vector{DeclaredAttribute}` is always returned.
"""
function get_declared_attributes(data)
    # Typed empty vector (rather than `[]`) so the result already has the element type
    # that `DTDBody` stores.
    return DeclaredAttribute[]
end
44+
45+
#-----------------------------------------------------------------------------# DeclaredEntity
46+
"""
    DeclaredEntity(name, value)

One entity declaration of a DTD. It is displayed as `<!ENTITY name value>`.
"""
struct DeclaredEntity
    name::String
    value::String
end
function Base.show(io::IO, o::DeclaredEntity)
    text = string("<!ENTITY ", o.name, " ", o.value, ">")
    print(io, text)
    return nothing
end
51+
52+
"""
    get_declared_entities(data)

Placeholder: entity declarations are not parsed yet, so `data` is ignored and an empty
`Vector{DeclaredEntity}` is always returned.
"""
function get_declared_entities(data)
    # Typed empty vector (rather than `[]`) so the result already has the element type
    # that `DTDBody` stores.
    return DeclaredEntity[]
end
55+
56+
#-----------------------------------------------------------------------------# DTDBody
57+
"""
    DTDBody(root, elements, attributes, entities)

Parsed contents of a `<!DOCTYPE ...>` declaration: the root element name plus the
element, attribute and entity declarations found in it.
"""
struct DTDBody
    root::String
    elements::Vector{DeclaredElement}
    attributes::Vector{DeclaredAttribute}
    entities::Vector{DeclaredEntity}
end

# Multi-line summary: a header line followed by one section per declaration kind.
function Base.show(io::IO, o::DTDBody)
    # Close the quote and parenthesis opened by the header.
    println(io, "DTDBody(root=\"", o.root, "\")")
    println(io, " • DeclaredElements")
    foreach(x -> println(io, " ", x), o.elements)
    # List attributes and entities the same way as elements instead of printing
    # bare section headers.
    println(io, " • DeclaredAttributes")
    foreach(x -> println(io, " ", x), o.attributes)
    println(io, " • DeclaredEntities")
    foreach(x -> println(io, " ", x), o.entities)
    return nothing
end
71+
72+
73+
74+
"""
    DTDBody(data::Vector{UInt8})

Parse the bytes of a `<!DOCTYPE ...>` declaration.

The root name is read right after `<!DOCTYPE`. When `data` contains no `[`, the result
has no declarations; otherwise elements, attributes and entities are collected with
`get_declared_elements`, `get_declared_attributes` and `get_declared_entities`.

Throws an error when `data` does not begin with `<!DOCTYPE`.
"""
function DTDBody(data::Vector{UInt8})
    prefix = Vector{UInt8}("<!DOCTYPE")
    n = length(prefix)
    # Check the length first so short or empty input produces the descriptive error
    # rather than a BoundsError from slicing.
    (length(data) >= n && view(data, 1:n) == prefix) ||
        error("DTD must start with `<!DOCTYPE`.")
    root, i = get_name(data, n + 1)

    if isnothing(findnext(==(UInt8('[')), data, i))
        return DTDBody(root, DeclaredElement[], DeclaredAttribute[], DeclaredEntity[])
    end

    elements = get_declared_elements(data)
    attributes = get_declared_attributes(data)
    entities = get_declared_entities(data)
    return DTDBody(root, elements, attributes, entities)
end
88+
89+
90+
# Read the bytes of the file at `filename` and parse them as a DTD. Accepts any
# `AbstractString` path.
Base.read(filename::AbstractString, ::Type{DTDBody}) = DTDBody(read(filename))
91+
# Read all remaining bytes of `io` and parse them as a DTD.
Base.read(io::IO, ::Type{DTDBody}) = DTDBody(read(io))
92+
# Parse a DTD from the bytes of the string `s` (string-first argument order).
Base.parse(s::AbstractString, ::Type{DTDBody}) = DTDBody(Vector{UInt8}(s))
93+
# Type-first argument order; forwards to the string-first `parse(s, DTDBody)` method.
Base.parse(::Type{DTDBody}, s::AbstractString) = parse(s, DTDBody)

src/raw.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ xml_nodes(o::Raw) = Iterators.Filter(is_node, o)
102102

103103
#-----------------------------------------------------------------------------# get_name
104104
# `true` when byte `x` is an ASCII letter (`A`-`Z`, `a`-`z`) or an underscore.
function is_name_start_char(x::UInt8)
    is_upper = UInt8('A') <= x <= UInt8('Z')
    is_lower = UInt8('a') <= x <= UInt8('z')
    return is_upper || is_lower || x == UInt8('_')
end
105-
is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.')
105+
# `true` when byte `x` may appear inside a name: any name-start byte, an ASCII digit, `-`, `.` or `:`.
is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.') || x == UInt8(':')
106106

107107
# Index of the first element at or after `i` satisfying `is_name_start_char`, or `nothing` if there is none.
name_start(data, i) = findnext(is_name_start_char, data, i)
108108
# Index just before the first non-name byte at or after `i`. When every remaining byte
# is a name byte the name runs to the end of `data`, so `lastindex(data)` is returned
# instead of failing on `nothing - 1`.
name_stop(data, i) = something(findnext(!is_name_char, data, i), lastindex(data) + 1) - 1
@@ -176,7 +176,7 @@ Return the value of `Text`, `CData`, `Comment`, or `DTD` nodes.
176176
"""
177177
function value(o::Raw)
178178
if o.type === RawText
179-
unescape(String(o))
179+
String(o)
180180
elseif o.type === RawCData
181181
String(view(o.data, o.pos + length("<![CData[") : o.pos + o.len - 3))
182182
elseif o.type === RawComment

test/Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[deps]
2+
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
3+
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
4+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

test/runtests.jl

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
using XML
2-
using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text
2+
using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text, escape, unescape
33
using Downloads: download
44
using Test
5+
import AbstractTrees
6+
7+
AbstractTrees.children(x::Node) = children(x)
58

69
#-----------------------------------------------------------------------------# files
710
xml_spec = download("http://www.w3.org/2001/xml.xsd")
@@ -16,6 +19,15 @@ all_files = [
1619
"example.kml" => example_kml
1720
]
1821

22+
#-----------------------------------------------------------------------------# utils
23+
# Round-trip and idempotence checks for `escape` / `unescape`.
@testset "utils" begin
    raw = "This > string < has & some \" special ' characters"
    expected = "This &gt; string &lt; has &amp; some &quot; special &apos; characters"
    @test escape(raw) == expected
    # Escaping already-escaped text must not change it.
    @test escape(escape(raw)) == escape(raw)
    @test unescape(escape(raw)) == raw
    # Unescaping plain text is a no-op as well.
    @test unescape(unescape(escape(raw))) == raw
end
30+
1931
#-----------------------------------------------------------------------------# Raw
2032
@testset "Raw tag/attributes/value" begin
2133
examples = [

0 commit comments

Comments
 (0)