Skip to content

Commit e56c017

Browse files
committed
Add Tables.jl integration for Tokens and Rows
1 parent 24606ed commit e56c017

File tree

3 files changed

+11
-282
lines changed

3 files changed

+11
-282
lines changed

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
88
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
99
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
1010
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
11+
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1112

1213
[compat]
1314
AbstractTrees = "0.3, 0.4"

src/XML.jl

Lines changed: 10 additions & 270 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ module XML
33
using OrderedCollections: OrderedDict
44
using Base: @kwdef, StringVector
55
using Mmap
6+
using Tables
67
# import AbstractTrees: print_tree, printnode, children
78

89
export Document, DTD, Declaration, Comment, CData, Element,
@@ -42,17 +43,20 @@ unescape(x::AbstractString) = replace(x, reverse.(escape_chars)...)
4243
struct Tokens
4344
filename::String
4445
data::Vector{UInt8}
45-
name_map::Dict{String, Symbol}
46-
Tokens(filename::String) = new(filename, Mmap.mmap(filename), Dict{String, Symbol}())
46+
Tokens(filename::String) = new(filename, Mmap.mmap(filename))
4747
end
48+
# Tables.jl hook: a `Tokens` object is returned unchanged as its own row iterator.
function Tables.rows(o::Tokens)
    return o
end
49+
# Tables.jl hook: the schema is built from the field names and field types of `TokenData`.
function Tables.schema(o::Tokens)
    names = fieldnames(TokenData)
    types = fieldtypes(TokenData)
    return Tables.Schema(names, types)
end
4850

4951
struct TokenData
5052
tok::XMLToken
5153
depth::Int
54+
pos::Int
5255
data::typeof(view(Vector{UInt8}("example"), 1:2))
5356
end
5457
function Base.show(io::IO, o::TokenData)
55-
print(io, o.tok, '(', o.depth, "): ")
58+
print(io, o.tok)
59+
printstyled(io, " (depth=", o.depth, ", ", "pos=", o.pos, ") : "; color=:light_black)
5660
printstyled(io, String(copy(o.data)); color=:light_green)
5761
end
5862

@@ -107,7 +111,7 @@ function Base.iterate(o::Tokens, state = (1, 1))
107111
error("Unexpected character: $c")
108112
end
109113
tok === TOK_UNKNOWN && error("Token isn't identified: $(String(data[i:j]))")
110-
return TokenData(tok, depth, view(o.data, i:j)) => (j + 1, depth)
114+
return TokenData(tok, depth, i, view(o.data, i:j)) => (j + 1, depth)
111115
end
112116

113117

@@ -116,6 +120,8 @@ struct Rows
116120
tokens::Tokens
117121
end
118122
# Convenience constructor: build a `Tokens` for `filename` and wrap it in a `Rows`.
function Rows(filename::String)
    tokens = Tokens(filename)
    return Rows(tokens)
end
123+
# Tables.jl hook: a `Rows` object is returned unchanged as its own row iterator.
function Tables.rows(o::Rows)
    return o
end
124+
# Tables.jl hook: the schema is built from the field names and field types of `RowNode`.
function Tables.schema(o::Rows)
    names = fieldnames(RowNode)
    types = fieldtypes(RowNode)
    return Tables.Schema(names, types)
end
119125

120126
struct RowNode
121127
depth::Int
@@ -160,7 +166,6 @@ function _print_attrs(io::IO, o)
160166
!isnothing(o.attributes) && printstyled(io, [" $k=\"$v\"" for (k,v) in o.attributes]...; color=:light_black)
161167
end
162168

163-
164169
# Iteration trait: the number of elements in a `Rows` is not known before iterating.
function Base.IteratorSize(::Type{Rows})
    return Base.SizeUnknown()
end
165170
# Iteration trait: declare `RowNode` as the element type of `Rows`.
function Base.eltype(::Type{Rows})
    return RowNode
end
166171
# Delegate the optional `isdone` iteration hook to the wrapped token iterator.
# `Rows` keeps its source in the `tokens` field (it has no `file` field), and
# `isdone` is not exported from Base, so the inner call must be qualified.
Base.isdone(o::Rows, pos) = Base.isdone(o.tokens, pos)
@@ -307,122 +312,13 @@ function _show_node(io, o)
307312
error("Unreachable reached")
308313
end
309314
end
310-
function _print_attrs(io::IO, o)
311-
!isnothing(o.attributes) && printstyled(io, [" $k=\"$v\"" for (k,v) in o.attributes]...; color=:light_black)
312-
end
313-
function _print_n_children(io::IO, o)
314-
hasfield(typeof(o), :children) && !isnothing(o.children) && printstyled(io, " (", length(o.children), " children)", color=:light_black)
315-
end
316315

317316
# `node[i]` returns the i-th entry of `node.children`.
function Base.getindex(o::Node, i::Integer)
    return o.children[i]
end
318317
# `node[i] = val` converts `val` with `Node(val)` and stores it as the i-th child.
# The freshly constructed node is the value of the expression.
function Base.setindex!(o::Node, val, i::Integer)
    return o.children[i] = Node(val)
end
319318
# Supports `node[end]` by reporting the last index of `node.children`.
function Base.lastindex(o::Node)
    return lastindex(o.children)
end
320319

321320
# Append node `b` to `a.children`; returns whatever `push!` on `a.children` returns.
function Base.push!(a::Node, b::Node)
    return push!(a.children, b)
end
322321

323-
#-----------------------------------------------------------------------------# FileChunk
324-
# struct FileChunk
325-
# file::File
326-
# rng::UnitRange{Int}
327-
# nodetype::NodeType
328-
# end
329-
# data(o::FileChunk) = (data = view(o.file.data, o.rng), nodetype = o.nodetype)
330-
331-
# Base.IteratorSize(::Type{FileChunk}) = Base.SizeUnknown()
332-
# Base.eltype(::Type{FileChunk}) = typeof(())
333-
334-
335-
336-
# Base.IteratorSize(::Type{File}) = Base.SizeUnknown()
337-
# Base.eltype(::Type{File}) = Node
338-
339-
# # state = (position, depth)
340-
# function Base.iterate(o::File, state=(1, 0))
341-
# pos, depth = state
342-
# pos = findnext(x -> !isspace(Char(x)), o.data, pos)
343-
# isnothing(pos) && return nothing
344-
345-
# char = Char(o.data[pos])
346-
# isletter(char) && return get_text(o, pos, depth)
347-
348-
# char != '<' && error("Unexpected character: $char")
349-
# pos += 1
350-
# char = Char(o.data[pos])
351-
# if char === '/'
352-
# depth -= 1
353-
# pos = findnext(x -> x == UInt8('>'), o.data, pos)
354-
# return iterate(o, (pos + 1, depth))
355-
# end
356-
# isletter(char) && return get_element(o, pos, depth)
357-
# char === '?' && return get_declaration(o, pos + 3, depth)
358-
# char != '!' && error("Unexpected character: $char")
359-
360-
# pos += 1
361-
# char = Char(o.data[pos])
362-
# char === '-' && return get_comment(o, pos + 1, depth)
363-
# char === '[' && return get_cdata(o, pos + length("CDATA["), depth)
364-
# char === 'D' && return get_dtd(o, pos + length("OCTYPE"), depth)
365-
# end
366-
367-
# function get_text(o::File, pos, depth)
368-
# pos2 = findnext(x -> x == UInt8('<'), o.data, pos) - 1
369-
# return Node(TEXT_NODE; depth, content=unescape(String(o.data[pos:pos2]))) => (pos2 + 1, depth)
370-
# end
371-
# function get_element(o::File, pos, depth)
372-
# tag, pos = get_name(o, pos)
373-
# attributes, pos = get_attributes(o, pos)
374-
# pos = findnext(==(UInt8('>')), o.data, pos)
375-
# o.data[pos-1] !== UInt8('/') && (depth += 1)
376-
# return Node(ELEMENT_NODE; depth, tag, attributes) => (pos + 1, depth)
377-
# end
378-
# function get_declaration(o::File, pos, depth)
379-
# attributes, pos = get_attributes(o, pos)
380-
# return Node(DECLARATION_NODE; depth, attributes) => (pos, depth)
381-
# end
382-
# function get_comment(o::File, pos, depth)
383-
# a, b = extrema(findnext(Vector{UInt8}("-->"), o.data, pos))
384-
# content = o.data[pos:a-1]
385-
# return Node(COMMENT_NODE; depth, content) => (b + 1, depth)
386-
# end
387-
# function get_cdata(o::File, pos, depth)
388-
# a, b = extrema(findnext(Vector{UInt8}("]]>"), o.data, pos))
389-
# content = String(o.data[pos:a-1])
390-
# return Node(CDATA_NODE; depth, content) => (b + 1, depth)
391-
# end
392-
# function get_dtd(o::File, pos, depth)
393-
# pos2 = findnext(==(UInt8('>')), o.data, pos)
394-
# content = String(o.data[pos:pos2-1])
395-
# return Node(DTD_NODE; depth, content) => (pos2 + 1, depth)
396-
# end
397-
398-
# function get_name(o::File, pos)
399-
# pos2 = findnext(x -> !(isletter(Char(x)) || isdigit(Char(x)) || x ∉ Vector{UInt8}("._-:")), o.data, pos)
400-
# name = String(o.data[pos:pos2-1])
401-
# return name, pos2
402-
# end
403-
404-
# function get_attributes(o::File, pos)
405-
# out = OrderedDict{Symbol, String}()
406-
# pos2 = pos
407-
# while true
408-
# if isspace(Char(o.data[pos2]))
409-
# pos += 1
410-
# continue
411-
# end
412-
# o.data[pos] == UInt8('>') && break
413-
# key, pos = get_name(o, pos)
414-
# @info key, pos
415-
# pos = findnext(x -> Char(x) === '"' || Char(x) === ''', o.data, pos) + 1
416-
# quotechar = o.data[pos]
417-
# pos2 = findnext(==(quotechar), o.data, pos)
418-
# value = String(o.data[pos:pos2-1])
419-
# out[Symbol(key)] = value
420-
# pos = pos2 + 1
421-
# end
422-
# return out, pos2 + 1
423-
# end
424-
425-
426322
# Read a `Node` from the file at `filename`: open it, hand the stream to
# `read(io, Node)`, and let `open` close the stream afterwards.
function Base.read(filename::AbstractString, ::Type{Node})
    return open(filename) do io
        read(io, Node)
    end
end
427323

428324
function Base.read(io::IO, ::Type{Node})
@@ -439,162 +335,6 @@ end
439335

440336
# Return `o` unchanged when it already has a `children` value; otherwise return a
# new `Node` built from `o` with an empty `Node[]` as its children.
function _with_children(o::Node)
    isnothing(o.children) || return o
    return Node(o, children=Node[])
end
441337

442-
#-----------------------------------------------------------------------------# StreamingIterator
443-
@kwdef struct StreamingIterator
444-
io::IO
445-
buf::IOBuffer = IOBuffer()
446-
debug::Bool = false
447-
end
448-
function StreamingIterator(io::IO; kw...)
449-
isreadable(io) || error("IO input to StreamingIterator is not readable.")
450-
StreamingIterator(; io, kw...)
451-
end
452-
453-
Base.eltype(::Type{<:StreamingIterator}) = Node
454-
Base.IteratorSize(::Type{<:StreamingIterator}) = Base.SizeUnknown()
455-
Base.isdone(itr::StreamingIterator, state...) = eof(itr.io)
456-
457-
function Base.read(o::StreamingIterator, x...)
458-
item = read(o.io, x...)
459-
write(o.buf, item)
460-
return item
461-
end
462-
Base.readuntil(o::StreamingIterator, x; keep=false) = readuntil(o.io, x; keep)
463-
Base.peek(o::StreamingIterator, x) = peek(o.io, x)
464-
skip_spaces(o::StreamingIterator) = skipchars(isspace, o.io)
465-
Base.skip(o::StreamingIterator, n::Integer) = skip(o.io, n)
466-
# Base.readeach(o::StreamingIterator, T) = readeach(o.io, T)
467-
Base.take!(o::StreamingIterator) = take!(o.buf)
468-
469-
# state = (index,depth)
470-
function Base.iterate(o::StreamingIterator, state = (0, 1))
471-
state[1] == 0 && seekstart(o.io)
472-
next, state2 = get_next(o, state)
473-
return isnothing(next) ? nothing : (next, state2)
474-
end
475-
476-
function get_next(o::StreamingIterator, state)
477-
skip_spaces(o)
478-
Base.isdone(o) && return (nothing, nothing)
479-
take!(o) # ensure buffer starts from scratch
480-
index, depth = state
481-
#---------------------------------# CASE 1: TEXT_NODE
482-
char = peek(o, Char)
483-
if char !== '<'
484-
char = read(o, Char)
485-
while true
486-
peek(o, Char) === '<' ? break : read(o, Char)
487-
end
488-
content = String(take!(o))
489-
return Node(TEXT_NODE; content, depth) => (index + 1, depth)
490-
elseif char === '<'
491-
skip(o, 1)
492-
else
493-
error("Expected a letter (text node) or '<'. Found: '$char'.")
494-
end
495-
#---------------------------------# CASE 2: Closing tag of ELEMENT_NODE: </NAME>
496-
char = peek(o, Char)
497-
if char === '/'
498-
closing_tag = readuntil(o, '>')
499-
return get_next(o, (index, depth - 1))
500-
end
501-
#---------------------------------# CASE 3: Opening tag of ELEMENT_NODE: <NAME attributes... >
502-
if isletter(char) || char === '_' # Names can begin with a letter or underscore
503-
tag = read_name(o)
504-
attributes = read_attributes(o)
505-
c = read(o, Char)
506-
if c === '/'
507-
read(o, Char) === '>' || error("Expected '>' after '/' at end of tag.")
508-
elseif c !== '>'
509-
error("Expected '>' at end of tag. Found: '$c'.")
510-
end
511-
nextdepth = depth + (c === '>')
512-
return Node(ELEMENT_NODE; tag, attributes, depth) => (index + 1, nextdepth)
513-
end
514-
#---------------------------------# CASE 4: DECLARATION_NODE: <?xml ... ?>
515-
if char === '?'
516-
skip(o, 1)
517-
tag = read_name(o)
518-
tag == "xml" || error("Expected 'xml' tag. Found: '$tag'.")
519-
attributes = read_attributes(o)
520-
skip_spaces(o)
521-
read(o, Char) === '?' || error("Expected '?>' at end of declaration.")
522-
read(o, Char) === '>' || error("Expected '?>' at end of declaration.")
523-
return Node(DECLARATION_NODE; attributes, depth) => (index + 1, depth)
524-
end
525-
526-
#---------------------------------# CASE 5: Error handling for invalid characters
527-
char = read(o, Char) # same as peek above
528-
char !== '!' && error("Expected character after '<' to be a letter, '?', or '!'. Found: '$char'.")
529-
530-
# Everything after here begins with: <!
531-
532-
#---------------------------------# CASE 6: DTD_NODE: <!DOCTYPE ...>
533-
if peek(o, Char) in "dD"
534-
tag = read_name(o)
535-
tag == "doctype" || tag == "DOCTYPE" || error("Expected 'DOCTYPE' tag. Found: '$tag'.")
536-
content = readuntil(o, '>'; keep=false)
537-
return Node(COMMENT_NODE; content, depth) => (index + 1, depth)
538-
end
539-
540-
#---------------------------------# CASE 7: COMMENT_NODE: <!-- ... -->
541-
char = read(o, Char) # <!
542-
if char === '-'
543-
read(o, Char) === '-' || error("Expected '<!--'. Found: '<!-$char'.")
544-
take!(o)
545-
content = readuntil(o, "-->"; keep=false)
546-
return Node(COMMENT_NODE; content, depth) => (index + 1, depth)
547-
end
548-
549-
#---------------------------------# CASE 8: CDATA_NODE: <![CDATA[ ... ]]>
550-
if char === '['
551-
take!(o)
552-
tag = read_name(o)
553-
tag === "CDATA" || error("Expected 'CDATA' tag. Found: '$tag'.")
554-
read(o, Char) === '[' || error("Expected '[' after 'CDATA'.")
555-
take!(o)
556-
content = readuntil(o, "]]>"; keep=false)
557-
return Node(CDATA_NODE; content, depth) => (index + 1, depth)
558-
end
559-
560-
error("Unknown error. String buffer contains: $(String(take!(o)))")
561-
end
562-
563-
function read_name(o::StreamingIterator)
564-
char = peek(o, Char)
565-
isletter(char) || char === '_' || error("Expected a letter or underscore. Found '$char'.")
566-
read(o, Char)
567-
while true
568-
char = peek(o, Char)
569-
(isletter(char) || isdigit(char) || char in "_-.:") ? read(o, Char) : break
570-
end
571-
return String(take!(o))
572-
end
573-
574-
function read_attributes(o::StreamingIterator)
575-
skip_spaces(o)
576-
peek(o, Char) in "?/>" && return nothing
577-
out = OrderedDict{String,String}()
578-
while true
579-
peek(o, Char) in "?/>" && break
580-
key = read_name(o)
581-
skip_spaces(o)
582-
read(o, Char) === '=' || error("Expected '=' after attribute name.")
583-
skip_spaces(o)
584-
quotechar = read(o, Char)
585-
val = readuntil(o, quotechar; keep=false)
586-
out[key] = val
587-
skip_spaces(o)
588-
take!(o)
589-
end
590-
return out
591-
end
592-
593-
594-
595-
596-
597-
598338

599339

600340

src/iterator.jl

Lines changed: 0 additions & 12 deletions
This file was deleted.

0 commit comments

Comments
 (0)