@@ -3,6 +3,7 @@ module XML
3
3
using OrderedCollections: OrderedDict
4
4
using Base: @kwdef , StringVector
5
5
using Mmap
6
+ using Tables
6
7
# import AbstractTrees: print_tree, printnode, children
7
8
8
9
export Document, DTD, Declaration, Comment, CData, Element,
@@ -42,17 +43,20 @@ unescape(x::AbstractString) = replace(x, reverse.(escape_chars)...)
42
43
"""
    Tokens(filename::String)

Token source backed by the file at `filename`.  The file contents are obtained with
`Mmap.mmap`, and `data` holds the resulting bytes.
"""
struct Tokens
    filename::String
    data::Vector{UInt8}

    function Tokens(filename::String)
        bytes = Mmap.mmap(filename)
        return new(filename, bytes)
    end
end
48
# Tables.jl row-table interface for `Tokens`.
# `istable`/`rowaccess` must be declared alongside `rows`; without them generic Tables.jl
# consumers do not recognise `Tokens` as a table and never call `Tables.rows`.
Tables.istable(::Type{Tokens}) = true
Tables.rowaccess(::Type{Tokens}) = true

# A `Tokens` object is its own row iterator; iterating it yields `TokenData` values.
Tables.rows(o::Tokens) = o

# Column names and types are taken directly from the fields of `TokenData`.
Tables.schema(o::Tokens) = Tables.Schema(fieldnames(TokenData), fieldtypes(TokenData))
48
50
49
51
"""
    TokenData

One token produced while iterating a `Tokens` source.
"""
struct TokenData
    tok::XMLToken                     # kind of token
    depth::Int                        # depth value carried by the iteration state
    pos::Int                          # index in the source bytes where the token starts
    data::typeof(view(UInt8[], 1:0))  # view (vector of bytes indexed by a unit range)
end
54
57
# Print a one-line summary of a token: the token kind first, then the depth and
# position metadata in light black, then the token's bytes (copied into a String)
# in light green.
function Base. show (io:: IO , o:: TokenData )
55
- print (io, o. tok, ' (' , o. depth, " ): " )
58
+ print (io, o. tok)
59
+ printstyled (io, " (depth=" , o. depth, " , " , " pos=" , o. pos, " ) : " ; color= :light_black )
56
60
printstyled (io, String (copy (o. data)); color= :light_green )
57
61
end
58
62
@@ -107,7 +111,7 @@ function Base.iterate(o::Tokens, state = (1, 1))
107
111
error (" Unexpected character: $c " )
108
112
end
109
113
tok === TOK_UNKNOWN && error (" Token isn't identified: $(String (data[i: j])) " )
110
- return TokenData (tok, depth, view (o. data, i: j)) => (j + 1 , depth)
114
+ return TokenData (tok, depth, i, view (o. data, i: j)) => (j + 1 , depth)
111
115
end
112
116
113
117
@@ -116,6 +120,8 @@ struct Rows
116
120
tokens:: Tokens
117
121
end
118
122
# Convenience constructor: build the `Tokens` source for `filename` and wrap it.
Rows(filename::String) = Rows(Tokens(filename))

# Tables.jl row-table interface for `Rows`.
# `istable`/`rowaccess` must be declared alongside `rows`; without them generic Tables.jl
# consumers do not recognise `Rows` as a table and never call `Tables.rows`.
Tables.istable(::Type{Rows}) = true
Tables.rowaccess(::Type{Rows}) = true

# A `Rows` object is its own row iterator.
Tables.rows(o::Rows) = o

# Column names and types are taken directly from the fields of `RowNode`.
Tables.schema(o::Rows) = Tables.Schema(fieldnames(RowNode), fieldtypes(RowNode))
119
125
120
126
struct RowNode
121
127
depth:: Int
@@ -160,7 +166,6 @@ function _print_attrs(io::IO, o)
160
166
! isnothing (o. attributes) && printstyled (io, [" $k =\" $v \" " for (k,v) in o. attributes]. .. ; color= :light_black )
161
167
end
162
168
163
-
164
169
# Iteration traits for `Rows`: the number of rows is not known up front and every
# element is a `RowNode`.
Base.IteratorSize(::Type{Rows}) = Base.SizeUnknown()
Base.eltype(::Type{Rows}) = RowNode

# Delegate to the wrapped token source.  `Rows` stores it in the `tokens` field (there is
# no `file` field), and `isdone` is not exported from Base, so it must be qualified.
# NOTE(review): assumes the `Rows` iteration state is also meaningful to `Tokens` — confirm.
Base.isdone(o::Rows, pos) = Base.isdone(o.tokens, pos)
@@ -307,122 +312,13 @@ function _show_node(io, o)
307
312
error (" Unreachable reached" )
308
313
end
309
314
end
310
# Print the attributes of `o` to `io` in light black, one `key="value"` string per
# attribute; prints nothing when `o.attributes` is `nothing`.
- function _print_attrs (io:: IO , o)
311
- ! isnothing (o. attributes) && printstyled (io, [" $k =\" $v \" " for (k,v) in o. attributes]. .. ; color= :light_black )
312
- end
313
# Print the number of children of `o` to `io` in light black.  Nothing is printed when
# the type of `o` has no `children` field or when `o.children` is `nothing`.
- function _print_n_children (io:: IO , o)
314
- hasfield (typeof (o), :children ) && ! isnothing (o. children) && printstyled (io, " (" , length (o. children), " children)" , color= :light_black )
315
- end
316
315
317
316
# Integer indexing on a `Node` forwards to its `children` collection.
function Base.getindex(o::Node, i::Integer)
    return o.children[i]
end

# The assigned value is wrapped with `Node(val)` before it is stored; the wrapped
# node is also the value of the call.
function Base.setindex!(o::Node, val, i::Integer)
    child = Node(val)
    o.children[i] = child
    return child
end

function Base.lastindex(o::Node)
    return lastindex(o.children)
end

# Append `b` to the children of `a`; returns whatever `push!` on the children returns.
function Base.push!(a::Node, b::Node)
    return push!(a.children, b)
end
322
321
323
- # -----------------------------------------------------------------------------# FileChunk
324
- # struct FileChunk
325
- # file::File
326
- # rng::UnitRange{Int}
327
- # nodetype::NodeType
328
- # end
329
- # data(o::FileChunk) = (data = view(o.file.data, o.rng), nodetype = o.nodetype)
330
-
331
- # Base.IteratorSize(::Type{FileChunk}) = Base.SizeUnknown()
332
- # Base.eltype(::Type{FileChunk}) = typeof(())
333
-
334
-
335
-
336
- # Base.IteratorSize(::Type{File}) = Base.SizeUnknown()
337
- # Base.eltype(::Type{File}) = Node
338
-
339
- # # state = (position, depth)
340
- # function Base.iterate(o::File, state=(1, 0))
341
- # pos, depth = state
342
- # pos = findnext(x -> !isspace(Char(x)), o.data, pos)
343
- # isnothing(pos) && return nothing
344
-
345
- # char = Char(o.data[pos])
346
- # isletter(char) && return get_text(o, pos, depth)
347
-
348
- # char != '<' && error("Unexpected character: $char")
349
- # pos += 1
350
- # char = Char(o.data[pos])
351
- # if char === '/'
352
- # depth -= 1
353
- # pos = findnext(x -> x == UInt8('>'), o.data, pos)
354
- # return iterate(o, (pos + 1, depth))
355
- # end
356
- # isletter(char) && return get_element(o, pos, depth)
357
- # char === '?' && return get_declaration(o, pos + 3, depth)
358
- # char != '!' && error("Unexpected character: $char")
359
-
360
- # pos += 1
361
- # char = Char(o.data[pos])
362
- # char === '-' && return get_comment(o, pos + 1, depth)
363
- # char === '[' && return get_cdata(o, pos + length("CDATA["), depth)
364
- # char === 'D' && return get_dtd(o, pos + length("OCTYPE"), depth)
365
- # end
366
-
367
- # function get_text(o::File, pos, depth)
368
- # pos2 = findnext(x -> x == UInt8('<'), o.data, pos) - 1
369
- # return Node(TEXT_NODE; depth, content=unescape(String(o.data[pos:pos2]))) => (pos2 + 1, depth)
370
- # end
371
- # function get_element(o::File, pos, depth)
372
- # tag, pos = get_name(o, pos)
373
- # attributes, pos = get_attributes(o, pos)
374
- # pos = findnext(==(UInt8('>')), o.data, pos)
375
- # o.data[pos-1] !== UInt8('/') && (depth += 1)
376
- # return Node(ELEMENT_NODE; depth, tag, attributes) => (pos + 1, depth)
377
- # end
378
- # function get_declaration(o::File, pos, depth)
379
- # attributes, pos = get_attributes(o, pos)
380
- # return Node(DECLARATION_NODE; depth, attributes) => (pos, depth)
381
- # end
382
- # function get_comment(o::File, pos, depth)
383
- # a, b = extrema(findnext(Vector{UInt8}("-->"), o.data, pos))
384
- # content = o.data[pos:a-1]
385
- # return Node(COMMENT_NODE; depth, content) => (b + 1, depth)
386
- # end
387
- # function get_cdata(o::File, pos, depth)
388
- # a, b = extrema(findnext(Vector{UInt8}("]]>"), o.data, pos))
389
- # content = String(o.data[pos:a-1])
390
- # return Node(CDATA_NODE; depth, content) => (b + 1, depth)
391
- # end
392
- # function get_dtd(o::File, pos, depth)
393
- # pos2 = findnext(==(UInt8('>')), o.data, pos)
394
- # content = String(o.data[pos:pos2-1])
395
- # return Node(DTD_NODE; depth, content) => (pos2 + 1, depth)
396
- # end
397
-
398
- # function get_name(o::File, pos)
399
- # pos2 = findnext(x -> !(isletter(Char(x)) || isdigit(Char(x)) || x ∉ Vector{UInt8}("._-:")), o.data, pos)
400
- # name = String(o.data[pos:pos2-1])
401
- # return name, pos2
402
- # end
403
-
404
- # function get_attributes(o::File, pos)
405
- # out = OrderedDict{Symbol, String}()
406
- # pos2 = pos
407
- # while true
408
- # if isspace(Char(o.data[pos2]))
409
- # pos += 1
410
- # continue
411
- # end
412
- # o.data[pos] == UInt8('>') && break
413
- # key, pos = get_name(o, pos)
414
- # @info key, pos
415
- # pos = findnext(x -> Char(x) === '"' || Char(x) === ''', o.data, pos) + 1
416
- # quotechar = o.data[pos]
417
- # pos2 = findnext(==(quotechar), o.data, pos)
418
- # value = String(o.data[pos:pos2-1])
419
- # out[Symbol(key)] = value
420
- # pos = pos2 + 1
421
- # end
422
- # return out, pos2 + 1
423
- # end
424
-
425
-
426
322
# Open `filename`, read a `Node` from the resulting stream, and close the file afterwards.
function Base.read(filename::AbstractString, ::Type{Node})
    return open(filename) do io
        read(io, Node)
    end
end
427
323
428
324
function Base. read (io:: IO , :: Type{Node} )
@@ -439,162 +335,6 @@ end
439
335
440
336
# Return `o` unchanged when it already has children; otherwise build a copy of `o` whose
# `children` is an empty `Node[]`.
function _with_children(o::Node)
    isnothing(o.children) || return o
    return Node(o, children=Node[])
end
441
337
442
- # -----------------------------------------------------------------------------# StreamingIterator
443
# Iterator state wrapped around an `IO`.  `buf` is a scratch buffer (a fresh `IOBuffer`
# by default) and `debug` is a flag that defaults to `false`.
@kwdef struct StreamingIterator
    io::IO
    buf::IOBuffer = IOBuffer()
    debug::Bool = false
end

# Positional-`io` constructor: rejects streams that are not readable, then forwards
# everything to the keyword constructor.
function StreamingIterator(io::IO; kw...)
    if !isreadable(io)
        error("IO input to StreamingIterator is not readable.")
    end
    return StreamingIterator(; io, kw...)
end
452
-
453
# Iteration traits: elements are `Node`s and the length is not known in advance.
Base.IteratorSize(::Type{<:StreamingIterator}) = Base.SizeUnknown()
Base.eltype(::Type{<:StreamingIterator}) = Node

# Iteration is finished once the wrapped stream reaches end-of-file; any iteration
# state passed in is ignored.
function Base.isdone(itr::StreamingIterator, state...)
    return eof(itr.io)
end
456
-
457
# Read from the wrapped stream and also write the value that was read into `o.buf`.
function Base.read(o::StreamingIterator, x...)
    value = read(o.io, x...)
    write(o.buf, value)
    return value
end

# The following methods forward straight to the wrapped stream; nothing is written
# to `o.buf`.
function Base.readuntil(o::StreamingIterator, x; keep=false)
    return readuntil(o.io, x; keep)
end

function Base.peek(o::StreamingIterator, x)
    return peek(o.io, x)
end

function skip_spaces(o::StreamingIterator)
    return skipchars(isspace, o.io)
end

function Base.skip(o::StreamingIterator, n::Integer)
    return skip(o.io, n)
end
# Base.readeach(o::StreamingIterator, T) = readeach(o.io, T)

# Take (and thereby empty) the contents of `o.buf`.
function Base.take!(o::StreamingIterator)
    return take!(o.buf)
end
468
-
469
# state = (index, depth)
# An index of 0 marks the start of iteration, in which case the stream is rewound first.
function Base.iterate(o::StreamingIterator, state = (0, 1))
    first(state) == 0 && seekstart(o.io)
    node, newstate = get_next(o, state)
    isnothing(node) && return nothing
    return (node, newstate)
end
475
-
476
# Read the next node from the stream.
#
# `state` is `(index, depth)`.  Returns `node => (index + 1, depth′)` for the node that
# was read, or `(nothing, nothing)` once the stream is exhausted.  Closing tags produce no
# node of their own: they lower the depth by one and the search continues.  Errors (via
# `error`) when the input does not match any of the recognised forms.
function get_next(o::StreamingIterator, state)
    skip_spaces(o)
    Base.isdone(o) && return (nothing, nothing)
    take!(o)  # ensure buffer starts from scratch
    index, depth = state

    # ---------------------------------# CASE 1: TEXT_NODE
    if peek(o, Char) !== '<'
        # Stop at the next '<' or at end-of-file, so text at the very end of the stream
        # is returned as a node instead of raising an EOFError from `peek`.
        while !eof(o.io) && peek(o, Char) !== '<'
            read(o, Char)
        end
        content = String(take!(o))
        return Node(TEXT_NODE; content, depth) => (index + 1, depth)
    end
    skip(o, 1)  # consume the '<' without recording it in the buffer

    # ---------------------------------# CASE 2: Closing tag of ELEMENT_NODE: </NAME>
    char = peek(o, Char)
    if char === '/'
        readuntil(o, '>')
        return get_next(o, (index, depth - 1))
    end

    # ---------------------------------# CASE 3: Opening tag of ELEMENT_NODE: <NAME attributes... >
    if isletter(char) || char === '_'  # Names can begin with a letter or underscore
        tag = read_name(o)
        attributes = read_attributes(o)
        c = read(o, Char)
        if c === '/'
            read(o, Char) === '>' || error("Expected '>' after '/' at end of tag.")
        elseif c !== '>'
            error("Expected '>' at end of tag. Found: '$c'.")
        end
        # Only a non-self-closing tag (`c === '>'`) opens a new nesting level.
        nextdepth = depth + (c === '>')
        return Node(ELEMENT_NODE; tag, attributes, depth) => (index + 1, nextdepth)
    end

    # ---------------------------------# CASE 4: DECLARATION_NODE: <?xml ... ?>
    if char === '?'
        skip(o, 1)
        tag = read_name(o)
        tag == "xml" || error("Expected 'xml' tag. Found: '$tag'.")
        attributes = read_attributes(o)
        skip_spaces(o)
        read(o, Char) === '?' || error("Expected '?>' at end of declaration.")
        read(o, Char) === '>' || error("Expected '?>' at end of declaration.")
        return Node(DECLARATION_NODE; attributes, depth) => (index + 1, depth)
    end

    # ---------------------------------# CASE 5: Error handling for invalid characters
    char = read(o, Char)  # same as peek above
    char !== '!' && error("Expected character after '<' to be a letter, '?', or '!'. Found: '$char'.")

    # Everything after here begins with: <!

    # ---------------------------------# CASE 6: DTD_NODE: <!DOCTYPE ...>
    if peek(o, Char) in "dD"
        # The '!' read above was recorded in the buffer; drop it so `read_name` returns
        # only the name itself.
        take!(o)
        tag = read_name(o)
        tag == "doctype" || tag == "DOCTYPE" || error("Expected 'DOCTYPE' tag. Found: '$tag'.")
        content = readuntil(o, '>'; keep=false)
        return Node(DTD_NODE; content, depth) => (index + 1, depth)
    end

    # ---------------------------------# CASE 7: COMMENT_NODE: <!-- ... -->
    char = read(o, Char)  # character following "<!"
    if char === '-'
        second = read(o, Char)
        second === '-' || error("Expected '<!--'. Found: '<!-$second'.")
        take!(o)
        content = readuntil(o, "-->"; keep=false)
        return Node(COMMENT_NODE; content, depth) => (index + 1, depth)
    end

    # ---------------------------------# CASE 8: CDATA_NODE: <![CDATA[ ... ]]>
    if char === '['
        take!(o)
        tag = read_name(o)
        tag == "CDATA" || error("Expected 'CDATA' tag. Found: '$tag'.")
        read(o, Char) === '[' || error("Expected '[' after 'CDATA'.")
        take!(o)
        content = readuntil(o, "]]>"; keep=false)
        return Node(CDATA_NODE; content, depth) => (index + 1, depth)
    end

    error("Unknown error. String buffer contains: $(String(take!(o)))")
end
562
-
563
# Read a name from the stream: a letter or underscore followed by any run of letters,
# digits, or the characters `_`, `-`, `.`, `:`.  The characters are read through `read`,
# so they accumulate in the buffer; the buffer's full contents are returned as a String.
function read_name(o::StreamingIterator)
    char = peek(o, Char)
    if !(isletter(char) || char === '_')
        error("Expected a letter or underscore. Found '$char'.")
    end
    read(o, Char)
    while true
        c = peek(o, Char)
        isletter(c) || isdigit(c) || c in "_-.:" || break
        read(o, Char)
    end
    return String(take!(o))
end
573
-
574
# Read `name = "value"` pairs until one of `?`, `/`, `>` is next in the stream.
# Returns `nothing` when there are no attributes at all, otherwise an ordered dictionary
# mapping attribute names to values.  Whatever character follows the `=` (after optional
# spaces) is used as the delimiter that ends the value.
function read_attributes(o::StreamingIterator)
    skip_spaces(o)
    peek(o, Char) in "?/>" && return nothing
    attrs = OrderedDict{String,String}()
    while !(peek(o, Char) in "?/>")
        name = read_name(o)
        skip_spaces(o)
        read(o, Char) === '=' || error("Expected '=' after attribute name.")
        skip_spaces(o)
        delimiter = read(o, Char)
        attrs[name] = readuntil(o, delimiter; keep=false)
        skip_spaces(o)
        take!(o)  # discard the '=' and delimiter recorded in the buffer
    end
    return attrs
end
592
-
593
-
594
-
595
-
596
-
597
-
598
338
599
339
600
340
0 commit comments