@@ -28,15 +28,15 @@ unescape(x::AbstractString) = replace(x, reverse.(escape_chars)...)
28
28
# -----------------------------------------------------------------------------# RawDataType
"""
    RawDataType

Enumeration of the raw chunk kinds:

    - RAW_TEXT                  # text
    - RAW_COMMENT               # <!-- ... -->
    - RAW_CDATA                 # <![CDATA[...]]>
    - RAW_DECLARATION           # <?xml attributes... ?>
    - RAW_DTD                   # <!DOCTYPE ...>
    - RAW_ELEMENT_OPEN          # <NAME attributes... >
    - RAW_ELEMENT_CLOSE         # </NAME>
    - RAW_ELEMENT_SELF_CLOSED   # <NAME attributes... />
    - RAW_DOCUMENT              # Something to initialize with (not really used)
"""
@enum RawDataType begin
    RAW_DOCUMENT
    RAW_TEXT
    RAW_COMMENT
    RAW_CDATA
    RAW_DECLARATION
    RAW_DTD
    RAW_ELEMENT_OPEN
    RAW_ELEMENT_CLOSE
    RAW_ELEMENT_SELF_CLOSED
end
@@ -75,6 +75,7 @@ Useful functions:
75
75
- prev(o::RawData) --> RawData of the previous chunk (or `nothing`).
76
76
- tag(o::RawData) --> String of the tag name (or `nothing`).
77
77
- attributes(o::RawData) --> OrderedDict{String, String} of the attributes (or `nothing`).
78
+ - value(o::RawData) --> String of the value (or `nothing`).
78
79
- children(o::RawData) --> Vector{RawData} of the children (or `nothing`).
79
80
- parent(o::RawData) --> RawData of the parent (or `nothing`)
80
81
"""
@@ -85,10 +86,11 @@ struct RawData
85
86
len:: Int
86
87
data:: Vector{UInt8}
87
88
end
88
- function RawData (filename:: String )
89
- data = Mmap. mmap (filename)
90
- RawData (RAW_DOCUMENT, 0 , 0 , 0 , data)
91
- end
89
# Wrap an in-memory byte buffer as a RAW_DOCUMENT chunk, with zero for each of
# the three integer fields.
function RawData(data::Vector{UInt8})
    return RawData(RAW_DOCUMENT, 0, 0, 0, data)
end

# Memory-map the file at `filename` and wrap the mapped bytes the same way.
function RawData(filename::String)
    bytes = Mmap.mmap(filename)
    return RawData(bytes)
end
91
+
92
# Build a RAW_DOCUMENT-level RawData from the bytes of an in-memory string.
function parse(x::AbstractString)
    bytes = Vector{UInt8}(x)
    return RawData(bytes)
end
93
+
92
94
function Base. show (io:: IO , o:: RawData )
93
95
print (io, o. depth, " : " , o. type, " (pos=" , o. pos, " , len=" , o. len, " )" )
94
96
o. len > 0 && printstyled (io, " : " , String (o. data[o. pos: o. pos + o. len]); color= :light_green )
@@ -109,17 +111,19 @@ function Base.iterate(o::RawData, state=o)
109
111
return isnothing (n) ? nothing : (n, n)
110
112
end
111
113
114
# `true` for every chunk except a closing tag (RAW_ELEMENT_CLOSE).
function is_node(o::RawData)
    return o.type !== RAW_ELEMENT_CLOSE
end
115
# Lazy iterator over `o` that keeps only the chunks accepted by `is_node`.
function nodes(o::RawData)
    return Iterators.filter(is_node, o)
end
112
116
113
117
# -----------------------------------------------------------------------------# get_name
114
118
# find the start/stop of a name given a starting position `i`
115
119
# Index of the first byte at or after `i` that is a letter or an underscore,
# or `nothing` when there is none.
function _name_start(data, i)
    return findnext(data, i) do byte
        c = Char(byte)
        isletter(c) || c === '_'
    end
end
116
120
# `true` when the byte `x`, read as a `Char`, is a letter, a digit, or one of `. _ - :`.
function is_name_char(x)
    c = Char(x)
    return isletter(c) || isdigit(c) || c in "._-:"
end
117
121
# Index of the first byte at or after `i` that is not a name character.
# Falls back to `length(data)` when every remaining byte is a name character.
function _name_stop(data, i)
    stop = findnext(!is_name_char, data, i)
    return something(stop, length(data))
end
121
125
122
- # start at position i, return name and position after name
126
+ # starting at position i, return name and position after name
123
127
function get_name (data, i)
124
128
i = _name_start (data, i)
125
129
j = _name_stop (data, i)
@@ -128,20 +132,21 @@ function get_name(data, i)
128
132
end
129
133
130
134
# -----------------------------------------------------------------------------# get_attributes
131
- function get_attributes (data)
135
"""
    get_attributes(data, i)

Starting at byte position `i`, parse `key="value"` / `key='value'` pairs until the
tag terminator (the next `>` or `?` that is not inside a quoted value).

Returns an `OrderedDict{String, String}` of the attributes in source order, or
`nothing` when no attribute name starts before the terminator.
"""
function get_attributes(data, i)
    is_terminator(x) = x == UInt8('>') || x == UInt8('?')
    is_quote(x) = x == UInt8('"') || x == UInt8('\'')
    # With no terminator at all, treat the end of `data` as the bound.
    next_stop(from) = something(findnext(is_terminator, data, from), lastindex(data) + 1)

    stop = next_stop(i)
    i = _name_start(data, i)
    # `_name_start` yields `nothing` when no name follows; that also means "no attributes".
    (isnothing(i) || i > stop) && return nothing

    out = OrderedDict{String, String}()
    while !isnothing(i) && i < stop
        key, i = get_name(data, i)
        # opening quote of the value (either ' or ")
        i = findnext(is_quote, data, i)
        isnothing(i) && break               # malformed: name without a quoted value
        qend = findnext(==(data[i]), data, i + 1)
        isnothing(qend) && break            # malformed: unterminated value
        out[key] = String(data[i+1:qend-1])
        # A '>' or '?' inside the quoted value is not the end of the tag:
        # look for the real terminator after the closing quote.
        if qend > stop
            stop = next_stop(qend + 1)
        end
        i = _name_start(data, qend)
    end
    return out
end
@@ -156,10 +161,12 @@ end
156
161
157
162
function attributes (o:: RawData )
158
163
if o. type === RAW_ELEMENT_OPEN || o. type === RAW_ELEMENT_SELF_CLOSED
159
- _, i = get_name (o. data, o. pos + 1 )
164
+ i = o. pos
165
+ i = _name_start (o. data, i)
166
+ i = _name_stop (o. data, i)
160
167
get_attributes (o. data, i)
161
168
elseif o. type === RAW_DECLARATION
162
- get_attributes (@view ( o. data[o . pos + 6 : o. pos + o . len - 1 ]) )
169
+ get_attributes (o. data, o. pos + 6 )
163
170
else
164
171
nothing
165
172
end
@@ -202,7 +209,13 @@ function parent(o::RawData)
202
209
return p
203
210
end
204
211
212
# Node type of a chunk: forwards the chunk's `type` field to the `nodetype` method for it.
function nodetype(o::RawData)
    return nodetype(o.type)
end
205
213
214
# sometimes I'd rather use e.g. `nodetype = _nodetype(o)`
# Underscore-prefixed aliases remain callable where a local variable shadows the
# plain name. They are `const` so calls through them do not go through an
# untyped global binding.
const _nodetype = nodetype
const _tag = tag
const _attributes = attributes
const _value = value
206
219
207
220
# -----------------------------------------------------------------------------# next RawData
208
221
# `true` when the byte `x`, read as a `Char`, is not whitespace.
function notspace(x::UInt8)
    return !isspace(Char(x))
end
@@ -220,12 +233,12 @@ function next(o::RawData)
220
233
if c != = ' <'
221
234
type = RAW_TEXT
222
235
j = findnext (== (UInt8 (' <' )), data, i) - 1
236
+ j = findprev (notspace, data, j) # "rstrip"
223
237
elseif c === ' <'
224
238
c2 = Char (o. data[i + 1 ])
225
239
if c2 === ' !'
226
240
c3 = Char (o. data[i + 2 ])
227
241
if c3 === ' -'
228
- i += 1
229
242
type = RAW_COMMENT
230
243
j = findnext (Vector {UInt8} (" -->" ), data, i)[end ]
231
244
elseif c3 === ' ['
@@ -268,6 +281,7 @@ function prev(o::RawData)
268
281
if c != = ' >' # text
269
282
type = RAW_TEXT
270
283
i = findprev (== (UInt8 (' >' )), data, j) + 1
284
+ i = findnext (notspace, data, i) # "lstrip"
271
285
elseif c === ' >'
272
286
c2 = Char (o. data[j - 1 ])
273
287
if c2 === ' -'
@@ -305,10 +319,10 @@ end
305
319
306
320
307
321
# -----------------------------------------------------------------------------# Lazy
308
- # struct LazyNode
309
- # data::RawData
310
- # end
311
- # LazyNode(filename::AbstractString) = LazyNode(RawData(filename))
322
"""
    LazyNode(data::RawData)
    LazyNode(filename::AbstractString)

Thin wrapper holding a `RawData`. The filename form builds the `RawData` from the
file at `filename`.
"""
struct LazyNode
    data::RawData
end

# `RawData` takes a `String` filename, so convert first: this keeps other
# `AbstractString`s (e.g. `SubString`) from raising a `MethodError`.
function LazyNode(filename::AbstractString)
    return LazyNode(RawData(String(filename)))
end
312
326
313
327
314
328
@@ -349,41 +363,49 @@ end
349
363
350
364
351
365
# -----------------------------------------------------------------------------# RowNode
352
- # struct RowNode
353
- # nodetype::NodeType
354
- # tag::Union{String, Nothing}
355
- # attributes::Union{OrderedDict{String, String}, Nothing}
356
- # value::Union{String, Nothing}
357
- # depth::Int
358
- # end
366
"""
    RowNode(nodetype, tag, attributes, value, data)
    RowNode(data::RawData)

Record of a node's `nodetype`, `tag`, `attributes` and `value`, plus the `RawData`
it was built from (or `nothing`). The single-argument form fills the first four
fields by calling `_nodetype`, `_tag`, `_attributes` and `_value`.
"""
struct RowNode
    nodetype::NodeType
    tag::Union{String, Nothing}
    attributes::Union{OrderedDict{String, String}, Nothing}
    value::Union{String, Nothing}
    data::Union{RawData, Nothing}
end

function RowNode(data::RawData)
    return RowNode(_nodetype(data.type), _tag(data), _attributes(data), _value(data), data)
end
380
+
359
381
# function RowNode(t::RawData)
360
382
# (; type, pos, len, depth) = t
361
- # pos === 0 && return RowNode(DOCUMENT_NODE , nothing, nothing, nothing, 0)
383
+ # pos === 0 && return RowNode(DOCUMENT , nothing, nothing, nothing, 0)
362
384
# data = view(t.data, pos:pos+len)
363
385
# @views if type === RAW_TEXT # text
364
- # return RowNode(TEXT_NODE , nothing, nothing, unescape(String(data), depth ))
386
+ # return RowNode(TEXT , nothing, nothing, unescape(String(data), data ))
365
387
# elseif type === RAW_COMMENT # <!-- ... -->
366
- # return RowNode(COMMENT_NODE , nothing, nothing, String(data[4:end-3]), depth )
388
+ # return RowNode(COMMENT , nothing, nothing, String(data[4:end-3]), data )
367
389
# elseif type === RAW_CDATA # <![CDATA[...]]>
368
- # return RowNode(CDATA_NODE , nothing, nothing, String(data[10:end-3]), depth )
390
+ # return RowNode(CDATA , nothing, nothing, String(data[10:end-3]), data )
369
391
# elseif type === RAW_DECLARATION # <?xml attributes... ?>
370
392
# rng = 7:length(data) - 2
371
393
# attributes = get_attributes(data[rng])
372
- # return RowNode(DECLARATION_NODE , nothing, attributes, nothing, depth )
394
+ # return RowNode(DECLARATION , nothing, attributes, nothing, data )
373
395
# elseif type === RAW_DTD # <!DOCTYPE ...>
374
- # return RowNode(DTD_NODE , nothing, nothing, String(data[10:end-1]), depth )
396
+ # return RowNode(DTD , nothing, nothing, String(data[10:end-1]), data )
375
397
# elseif type === RAW_ELEMENT_OPEN # <NAME attributes... >
376
398
# tag, i = get_name(data, 2)
377
399
# i = findnext(x -> isletter(Char(x)) || x === UInt8('_'), data, i)
378
400
# attributes = isnothing(i) ? nothing : get_attributes(data[i:end-1])
379
- # return RowNode(ELEMENT_NODE , tag, attributes, nothing, depth )
401
+ # return RowNode(ELEMENT , tag, attributes, nothing, data )
380
402
# elseif type === RAW_ELEMENT_CLOSE # </NAME>
381
403
# return nothing
382
404
# elseif type === RAW_ELEMENT_SELF_CLOSED # <NAME attributes... />
383
405
# tag, i = get_name(data, 2)
384
406
# i = findnext(x -> isletter(Char(x)) || x === UInt8('_'), data, i)
385
407
# attributes = isnothing(i) ? nothing : get_attributes(data[i:end-2])
386
- # return RowNode(ELEMENT_NODE , tag, attributes, nothing, depth )
408
+ # return RowNode(ELEMENT , tag, attributes, nothing, data )
387
409
# else
388
410
# error("Unhandled token: $tok.")
389
411
# end
@@ -469,48 +491,48 @@ end
469
491
# end
470
492
471
493
472
- # # -----------------------------------------------------------------------------# Node
473
- # Base.@kwdef struct Node
474
- # nodetype::NodeType
475
- # tag::Union{Nothing, String} = nothing
476
- # attributes::Union{Nothing, OrderedDict{String, String}} = nothing
477
- # value::Union{Nothing, String} = nothing
478
- # children::Union{Nothing, Vector{Node}} = nothing
479
- # depth::Int = 0
480
- # end
481
- # function Node((;nodetype, tag, attributes, value, children, depth)::Node; kw...)
482
- # Node(; nodetype, tag, attributes, value, children, depth, kw...)
483
- # end
484
- # function (o::Node)(children...)
485
- # isempty(children) && return o
486
- # out = sizehint!(Node[], length(children))
487
- # foreach(children) do x
488
- # if x isa Node
489
- # push!(out, Node(x; depth=o.depth + 1))
490
- # else
491
- # push!(out, Node(nodetype=TEXT_NODE, value=string(x), depth=o.depth + 1))
492
- # end
493
- # end
494
+ # -----------------------------------------------------------------------------# Node
495
"""
    Node(; nodetype, tag=nothing, attributes=nothing, value=nothing, children=nothing, depth=0)
    Node(o::Node; kw...)

Tree node built from keyword arguments; only `nodetype` is required. The second
form copies every field of `o`, with any keyword in `kw` overriding the copied value.
"""
Base.@kwdef struct Node
    nodetype::NodeType
    tag::Union{Nothing, String} = nothing
    attributes::Union{Nothing, OrderedDict{String, String}} = nothing
    value::Union{Nothing, String} = nothing
    children::Union{Nothing, Vector{Node}} = nothing
    depth::Int = 0
end

function Node(o::Node; kw...)
    return Node(;
        nodetype = o.nodetype,
        tag = o.tag,
        attributes = o.attributes,
        value = o.value,
        children = o.children,
        depth = o.depth,
        kw...,
    )
end
506
# Calling a node with arguments returns a copy of it whose children are those arguments.
# `Node` arguments are copied with `depth = o.depth + 1`; anything else becomes a
# `TEXT_NODE` node holding `string(x)` at that same depth. With no arguments, `o`
# itself is returned.
# NOTE(review): the commented-out RowNode code in this file uses `TEXT` rather than
# `TEXT_NODE` -- confirm `TEXT_NODE` is still a defined NodeType.
function (o::Node)(children...)
    isempty(children) && return o
    child_depth = o.depth + 1
    out = Node[
        x isa Node ? Node(x; depth=child_depth) :
            Node(nodetype=TEXT_NODE, value=string(x), depth=child_depth)
        for x in children
    ]
    return Node(o; children=out)
end
497
519
498
520
# function Node((; depth, nodetype, tag, attributes, value)::RowNode)
499
521
# Node(; depth, nodetype, tag, attributes, value)
500
522
# end
501
523
# Node(o::TokenData) = Node(RowNode(o))
502
524
503
- # function Base.:(==)(a::Node, b::Node)
504
- # a.nodetype == b.nodetype &&
505
- # a.tag == b.tag &&
506
- # a.attributes == b.attributes &&
507
- # a.value == b.value && (
508
- # (isnothing(a.children) && isnothing(b.children)) ||
509
- # (isnothing(a.children) && isempty(b.children)) ||
510
- # (isempty(a.children) && isnothing(b.children)) ||
511
- # all(ai == bi for (ai,bi) in zip(a.children, b.children))
512
- # )
513
- # end
525
# Structural equality of two nodes: same `nodetype`, `tag`, `attributes` and `value`,
# and pairwise-equal children. `depth` is not compared. `nothing` children are treated
# as an empty child list.
function Base.:(==)(a::Node, b::Node)
    a.nodetype == b.nodetype || return false
    a.tag == b.tag || return false
    a.attributes == b.attributes || return false
    a.value == b.value || return false
    # Normalise `nothing` to an empty collection so neither `isempty` nor `zip`
    # is ever applied to `nothing`.
    kids_a = something(a.children, ())
    kids_b = something(b.children, ())
    # `zip` stops at the shorter input, so the lengths must be compared explicitly.
    length(kids_a) == length(kids_b) || return false
    return all(x == y for (x, y) in zip(kids_a, kids_b))
end
514
536
515
537
# # function element(nodetype::NodeType, tag = nothing; attributes...)
516
538
# # attributes = isempty(attributes) ?
@@ -519,15 +541,15 @@ end
519
541
# # Node(; nodetype, tag, attributes)
520
542
# # end
521
543
522
- # Base.getindex(o::Node, i::Integer) = o.children[i]
523
- # Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
524
- # Base.lastindex(o::Node) = lastindex(o.children)
544
# Integer indexing on a node reads from and writes to its `children` vector.
function Base.getindex(o::Node, i::Integer)
    return o.children[i]
end

# Stores `Node(val)` as the `i`-th child and returns that new node.
function Base.setindex!(o::Node, val, i::Integer)
    node = Node(val)
    o.children[i] = node
    return node
end

function Base.lastindex(o::Node)
    return lastindex(o.children)
end
525
547
526
- # Base.push!(a::Node, b::Node) = push!(a.children, b)
548
# Append `b` to `a.children`; returns the result of `push!` on that vector.
function Base.push!(a::Node, b::Node)
    return push!(a.children, b)
end
527
549
528
- # AbstractTrees.children(o::Node) = isnothing(o.children) ? [] : o.children
550
# Children of `o` for AbstractTrees. When `o.children` is `nothing`, an empty
# `Vector{Node}` is returned (not an untyped `[]`) so the result always has
# element type `Node`.
AbstractTrees.children(o::Node) = isnothing(o.children) ? Node[] : o.children
529
551
530
- # Base.show(io::IO, o::Node) = _show_node(io, o)
552
# Display of a `Node` is delegated to `_show_node`.
function Base.show(io::IO, o::Node)
    return _show_node(io, o)
end
531
553
532
554
# #-----------------------------------------------------------------------------# read
533
555
# read(filename::AbstractString) = Node(Tokens(filename))
0 commit comments