Skip to content

Commit 44c5e5f

Browse files
committed
all tests green
1 parent 30fcf1c commit 44c5e5f

File tree

4 files changed

+181
-92
lines changed

4 files changed

+181
-92
lines changed

<!-- comment -->

Whitespace-only changes.

benchmarks/suite.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ file = download("http://schemas.opengis.net/kml/2.2.0/ogckml22.xsd")
77
filename = tempname()
88

99
#-----------------------------------------------------------------------------# read
10-
@info "XML read" @benchmark Node($file)
11-
@info "XML lazy" @benchmark XML.LazyNode($file)
10+
@info "XML lazy read" @benchmark XML.RawData($file)
11+
1212
@info "EzXML read" @benchmark EzXML.readxml($file)

src/XML.jl

Lines changed: 108 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,15 @@ unescape(x::AbstractString) = replace(x, reverse.(escape_chars)...)
2828
#-----------------------------------------------------------------------------# RawDataType
2929
"""
3030
RawDataType:
31-
RAW_TEXT # text
32-
RAW_COMMENT # <!-- ... -->
33-
RAW_CDATA # <![CDATA[...]]>
34-
RAW_DECLARATION # <?xml attributes... ?>
35-
RAW_DTD # <!DOCTYPE ...>
36-
RAW_ELEMENT_OPEN # <NAME attributes... >
37-
RAW_ELEMENT_CLOSE # </NAME>
38-
RAW_ELEMENT_SELF_CLOSED # <NAME attributes... />
39-
RAW_DOCUMENT # Something to initilize with (not really used)
31+
- RAW_TEXT # text
32+
- RAW_COMMENT # <!-- ... -->
33+
- RAW_CDATA # <![CDATA[...]]>
34+
- RAW_DECLARATION # <?xml attributes... ?>
35+
- RAW_DTD # <!DOCTYPE ...>
36+
- RAW_ELEMENT_OPEN # <NAME attributes... >
37+
- RAW_ELEMENT_CLOSE # </NAME>
38+
- RAW_ELEMENT_SELF_CLOSED # <NAME attributes... />
39+
- RAW_DOCUMENT # Something to initialize with (not really used)
4040
"""
4141
@enum(RawDataType, RAW_DOCUMENT, RAW_TEXT, RAW_COMMENT, RAW_CDATA, RAW_DECLARATION, RAW_DTD,
4242
RAW_ELEMENT_OPEN, RAW_ELEMENT_CLOSE, RAW_ELEMENT_SELF_CLOSED)
@@ -75,6 +75,7 @@ Useful functions:
7575
- prev(o::RawData) --> RawData of the previous chunk (or `nothing`).
7676
- tag(o::RawData) --> String of the tag name (or `nothing`).
7777
- attributes(o::RawData) --> OrderedDict{String, String} of the attributes (or `nothing`).
78+
- value(o::RawData) --> String of the value (or `nothing`).
7879
- children(o::RawData) --> Vector{RawData} of the children (or `nothing`).
7980
- parent(o::RawData) --> RawData of the parent (or `nothing`)
8081
"""
@@ -85,10 +86,11 @@ struct RawData
8586
len::Int
8687
data::Vector{UInt8}
8788
end
88-
function RawData(filename::String)
89-
data = Mmap.mmap(filename)
90-
RawData(RAW_DOCUMENT, 0, 0, 0, data)
91-
end
89+
# Wrap an in-memory byte vector: the result is tagged `RAW_DOCUMENT`, and the three
# arguments between the type tag and the data are all passed as 0.
RawData(data::Vector{UInt8}) = RawData(RAW_DOCUMENT, 0, 0, 0, data)
90+
# Build a RawData from a file path: the file is opened with `Mmap.mmap` and the
# result is handed to the `RawData(data)` method.
RawData(filename::String) = RawData(Mmap.mmap(filename))
91+
92+
# Build a RawData directly from a string `x` by converting it to a `Vector{UInt8}`.
parse(x::AbstractString) = RawData(Vector{UInt8}(x))
93+
9294
function Base.show(io::IO, o::RawData)
9395
print(io, o.depth, ": ", o.type, " (pos=", o.pos, ", len=", o.len, ")")
9496
o.len > 0 && printstyled(io, ": ", String(o.data[o.pos:o.pos + o.len]); color=:light_green)
@@ -109,17 +111,19 @@ function Base.iterate(o::RawData, state=o)
109111
return isnothing(n) ? nothing : (n, n)
110112
end
111113

114+
# Every chunk counts as a node except a closing tag (`RAW_ELEMENT_CLOSE`).
function is_node(o::RawData)
    return !(o.type === RAW_ELEMENT_CLOSE)
end
115+
# Lazily iterate over `o`, keeping only the items for which `is_node` returns true.
function nodes(o::RawData)
    return Iterators.filter(is_node, o)
end
112116

113117
#-----------------------------------------------------------------------------# get_name
114118
# find the start/stop of a name given a starting position `i`
115119
# Index of the first byte at or after `i` that can begin a name (a letter or '_'),
# or `nothing` when no such byte exists.
function _name_start(data, i)
    return findnext(data, i) do byte
        ch = Char(byte)
        ch === '_' || isletter(ch)
    end
end
116120
# A byte is a name character when, read as a `Char`, it is a letter, a digit,
# or one of '.', '_', '-', ':'.
is_name_char(x) = (c = Char(x); isletter(c) || isdigit(c) || c in "._-:")
117121
function _name_stop(data, i)
118122
i = findnext(!is_name_char, data, i)
119-
isnothing(i) ? nothing : i
123+
isnothing(i) ? length(data) : i
120124
end
121125

122-
# start at position i, return name and position after name
126+
# starting at position i, return name and position after name
123127
function get_name(data, i)
124128
i = _name_start(data, i)
125129
j = _name_stop(data, i)
@@ -128,20 +132,21 @@ function get_name(data, i)
128132
end
129133

130134
#-----------------------------------------------------------------------------# get_attributes
131-
function get_attributes(data)
135+
# starting at position i, return attributes up until the next '>' or '?' (the '?' ends an XML declaration)
136+
function get_attributes(data, i)
137+
j = findnext(x -> x == UInt8('>') || x == UInt8('?'), data, i)
138+
i = _name_start(data, i)
139+
i > j && return nothing
132140
out = OrderedDict{String, String}()
133-
i = 1
134-
while !isnothing(i)
135-
# get key
141+
while !isnothing(i) && i < j
136142
key, i = get_name(data, i)
137143
# get quotechar the value is wrapped in (either ' or ")
138144
i = findnext(x -> Char(x) === '"' || Char(x) === ''', data, i)
139145
quotechar = data[i]
140-
j = findnext(==(quotechar), data, i + 1)
141-
# get value and set it
142-
value = String(data[i+1:j-1])
146+
i2 = findnext(==(quotechar), data, i + 1)
147+
value = String(data[i+1:i2-1])
143148
out[key] = value
144-
i = _name_start(data, j + 1)
149+
i = _name_start(data, i2)
145150
end
146151
return out
147152
end
@@ -156,10 +161,12 @@ end
156161

157162
function attributes(o::RawData)
158163
if o.type === RAW_ELEMENT_OPEN || o.type === RAW_ELEMENT_SELF_CLOSED
159-
_, i = get_name(o.data, o.pos + 1)
164+
i = o.pos
165+
i = _name_start(o.data, i)
166+
i = _name_stop(o.data, i)
160167
get_attributes(o.data, i)
161168
elseif o.type === RAW_DECLARATION
162-
get_attributes(@view(o.data[o.pos + 6:o.pos + o.len - 1]))
169+
get_attributes(o.data, o.pos + 6)
163170
else
164171
nothing
165172
end
@@ -202,7 +209,13 @@ function parent(o::RawData)
202209
return p
203210
end
204211

212+
# Node type of a RawData chunk: forwards the chunk's `type` field to the
# `nodetype` method defined for that value.
nodetype(o::RawData) = nodetype(o.type)
205213

214+
# sometimes I'd rather use e.g. `nodetype = _nodetype(o)`
215+
_nodetype = nodetype
216+
_tag = tag
217+
_attributes = attributes
218+
_value = value
206219

207220
#-----------------------------------------------------------------------------# next RawData
208221
# `true` when the byte `x`, read as a `Char`, is not whitespace.
function notspace(x::UInt8)
    ch = Char(x)
    return !isspace(ch)
end
@@ -220,12 +233,12 @@ function next(o::RawData)
220233
if c !== '<'
221234
type = RAW_TEXT
222235
j = findnext(==(UInt8('<')), data, i) - 1
236+
j = findprev(notspace, data, j) # "rstrip"
223237
elseif c === '<'
224238
c2 = Char(o.data[i + 1])
225239
if c2 === '!'
226240
c3 = Char(o.data[i + 2])
227241
if c3 === '-'
228-
i += 1
229242
type = RAW_COMMENT
230243
j = findnext(Vector{UInt8}("-->"), data, i)[end]
231244
elseif c3 === '['
@@ -268,6 +281,7 @@ function prev(o::RawData)
268281
if c !== '>' # text
269282
type = RAW_TEXT
270283
i = findprev(==(UInt8('>')), data, j) + 1
284+
i = findnext(notspace, data, i) # "lstrip"
271285
elseif c === '>'
272286
c2 = Char(o.data[j - 1])
273287
if c2 === '-'
@@ -305,10 +319,10 @@ end
305319

306320

307321
#-----------------------------------------------------------------------------# Lazy
308-
# struct LazyNode
309-
# data::RawData
310-
# end
311-
# LazyNode(filename::AbstractString) = LazyNode(RawData(filename))
322+
struct LazyNode
323+
data::RawData
324+
end
325+
LazyNode(filename::AbstractString) = LazyNode(RawData(filename))
312326

313327

314328

@@ -349,41 +363,49 @@ end
349363

350364

351365
#-----------------------------------------------------------------------------# RowNode
352-
# struct RowNode
353-
# nodetype::NodeType
354-
# tag::Union{String, Nothing}
355-
# attributes::Union{OrderedDict{String, String}, Nothing}
356-
# value::Union{String, Nothing}
357-
# depth::Int
358-
# end
366+
struct RowNode
367+
nodetype::NodeType
368+
tag::Union{String, Nothing}
369+
attributes::Union{OrderedDict{String, String}, Nothing}
370+
value::Union{String, Nothing}
371+
data::Union{RawData, Nothing}
372+
end
373+
function RowNode(data::RawData)
374+
nodetype = _nodetype(data.type)
375+
tag = _tag(data)
376+
attributes = _attributes(data)
377+
value = _value(data)
378+
RowNode(nodetype, tag, attributes, value, data)
379+
end
380+
359381
# function RowNode(t::RawData)
360382
# (; type, pos, len, depth) = t
361-
# pos === 0 && return RowNode(DOCUMENT_NODE, nothing, nothing, nothing, 0)
383+
# pos === 0 && return RowNode(DOCUMENT, nothing, nothing, nothing, 0)
362384
# data = view(t.data, pos:pos+len)
363385
# @views if type === RAW_TEXT # text
364-
# return RowNode(TEXT_NODE, nothing, nothing, unescape(String(data), depth))
386+
# return RowNode(TEXT, nothing, nothing, unescape(String(data), data))
365387
# elseif type === RAW_COMMENT # <!-- ... -->
366-
# return RowNode(COMMENT_NODE, nothing, nothing, String(data[4:end-3]), depth)
388+
# return RowNode(COMMENT, nothing, nothing, String(data[4:end-3]), data)
367389
# elseif type === RAW_CDATA # <![CDATA[...]]>
368-
# return RowNode(CDATA_NODE, nothing, nothing, String(data[10:end-3]), depth)
390+
# return RowNode(CDATA, nothing, nothing, String(data[10:end-3]), data)
369391
# elseif type === RAW_DECLARATION # <?xml attributes... ?>
370392
# rng = 7:length(data) - 2
371393
# attributes = get_attributes(data[rng])
372-
# return RowNode(DECLARATION_NODE, nothing, attributes, nothing, depth)
394+
# return RowNode(DECLARATION, nothing, attributes, nothing, data)
373395
# elseif type === RAW_DTD # <!DOCTYPE ...>
374-
# return RowNode(DTD_NODE, nothing, nothing, String(data[10:end-1]), depth)
396+
# return RowNode(DTD, nothing, nothing, String(data[10:end-1]), data)
375397
# elseif type === RAW_ELEMENT_OPEN # <NAME attributes... >
376398
# tag, i = get_name(data, 2)
377399
# i = findnext(x -> isletter(Char(x)) || x === UInt8('_'), data, i)
378400
# attributes = isnothing(i) ? nothing : get_attributes(data[i:end-1])
379-
# return RowNode(ELEMENT_NODE, tag, attributes, nothing, depth)
401+
# return RowNode(ELEMENT, tag, attributes, nothing, data)
380402
# elseif type === RAW_ELEMENT_CLOSE # </NAME>
381403
# return nothing
382404
# elseif type === RAW_ELEMENT_SELF_CLOSED # <NAME attributes... />
383405
# tag, i = get_name(data, 2)
384406
# i = findnext(x -> isletter(Char(x)) || x === UInt8('_'), data, i)
385407
# attributes = isnothing(i) ? nothing : get_attributes(data[i:end-2])
386-
# return RowNode(ELEMENT_NODE, tag, attributes, nothing, depth)
408+
# return RowNode(ELEMENT, tag, attributes, nothing, data)
387409
# else
388410
# error("Unhandled token: $tok.")
389411
# end
@@ -469,48 +491,48 @@ end
469491
# end
470492

471493

472-
# #-----------------------------------------------------------------------------# Node
473-
# Base.@kwdef struct Node
474-
# nodetype::NodeType
475-
# tag::Union{Nothing, String} = nothing
476-
# attributes::Union{Nothing, OrderedDict{String, String}} = nothing
477-
# value::Union{Nothing, String} = nothing
478-
# children::Union{Nothing, Vector{Node}} = nothing
479-
# depth::Int = 0
480-
# end
481-
# function Node((;nodetype, tag, attributes, value, children, depth)::Node; kw...)
482-
# Node(; nodetype, tag, attributes, value, children, depth, kw...)
483-
# end
484-
# function (o::Node)(children...)
485-
# isempty(children) && return o
486-
# out = sizehint!(Node[], length(children))
487-
# foreach(children) do x
488-
# if x isa Node
489-
# push!(out, Node(x; depth=o.depth + 1))
490-
# else
491-
# push!(out, Node(nodetype=TEXT_NODE, value=string(x), depth=o.depth + 1))
492-
# end
493-
# end
494+
#-----------------------------------------------------------------------------# Node
495+
Base.@kwdef struct Node
496+
nodetype::NodeType
497+
tag::Union{Nothing, String} = nothing
498+
attributes::Union{Nothing, OrderedDict{String, String}} = nothing
499+
value::Union{Nothing, String} = nothing
500+
children::Union{Nothing, Vector{Node}} = nothing
501+
depth::Int = 0
502+
end
503+
function Node((;nodetype, tag, attributes, value, children, depth)::Node; kw...)
504+
Node(; nodetype, tag, attributes, value, children, depth, kw...)
505+
end
506+
function (o::Node)(children...)
507+
isempty(children) && return o
508+
out = sizehint!(Node[], length(children))
509+
foreach(children) do x
510+
if x isa Node
511+
push!(out, Node(x; depth=o.depth + 1))
512+
else
513+
push!(out, Node(nodetype=TEXT_NODE, value=string(x), depth=o.depth + 1))
514+
end
515+
end
494516

495-
# Node(o; children=out)
496-
# end
517+
Node(o; children=out)
518+
end
497519

498520
# function Node((; depth, nodetype, tag, attributes, value)::RowNode)
499521
# Node(; depth, nodetype, tag, attributes, value)
500522
# end
501523
# Node(o::TokenData) = Node(RowNode(o))
502524

503-
# function Base.:(==)(a::Node, b::Node)
504-
# a.nodetype == b.nodetype &&
505-
# a.tag == b.tag &&
506-
# a.attributes == b.attributes &&
507-
# a.value == b.value && (
508-
# (isnothing(a.children) && isnothing(b.children)) ||
509-
# (isnothing(a.children) && isempty(b.children)) ||
510-
# (isempty(a.children) && isnothing(b.children)) ||
511-
# all(ai == bi for (ai,bi) in zip(a.children, b.children))
512-
# )
513-
# end
525+
function Base.:(==)(a::Node, b::Node)
526+
a.nodetype == b.nodetype &&
527+
a.tag == b.tag &&
528+
a.attributes == b.attributes &&
529+
a.value == b.value && (
530+
(isnothing(a.children) && isnothing(b.children)) ||
531+
(isnothing(a.children) && isempty(b.children)) ||
532+
(isempty(a.children) && isnothing(b.children)) ||
533+
all(ai == bi for (ai,bi) in zip(a.children, b.children))
534+
)
535+
end
514536

515537
# # function element(nodetype::NodeType, tag = nothing; attributes...)
516538
# # attributes = isempty(attributes) ?
@@ -519,15 +541,15 @@ end
519541
# # Node(; nodetype, tag, attributes)
520542
# # end
521543

522-
# Base.getindex(o::Node, i::Integer) = o.children[i]
523-
# Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
524-
# Base.lastindex(o::Node) = lastindex(o.children)
544+
Base.getindex(o::Node, i::Integer) = o.children[i]
545+
Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
546+
Base.lastindex(o::Node) = lastindex(o.children)
525547

526-
# Base.push!(a::Node, b::Node) = push!(a.children, b)
548+
Base.push!(a::Node, b::Node) = push!(a.children, b)
527549

528-
# AbstractTrees.children(o::Node) = isnothing(o.children) ? [] : o.children
550+
# Children of `o` for AbstractTrees traversal. A node whose `children` field is
# `nothing` yields an empty `Node[]`, so both branches return a vector of `Node`
# (an untyped `[]` would be a `Vector{Any}`).
AbstractTrees.children(o::Node) = isnothing(o.children) ? Node[] : o.children
529551

530-
# Base.show(io::IO, o::Node) = _show_node(io, o)
552+
Base.show(io::IO, o::Node) = _show_node(io, o)
531553

532554
# #-----------------------------------------------------------------------------# read
533555
# read(filename::AbstractString) = Node(Tokens(filename))

0 commit comments

Comments
 (0)