|
1 | | - |
"""
    DTableColumn{T,TT}

Iterable view over a single column of a `DTable`.

`T` is the column element type and `TT` is the type returned by `iterate`
on a chunk's column (both are chosen by the `DTableColumn` constructor).

Fields:
- `dtable`: the table the column belongs to.
- `col`: positional index of the column.
- `colname`: name of the column.
- `chunk_lengths`: row count of each chunk; `length` sums these.
- `_chunk`: index of the chunk currently loaded (0 before iteration starts).
- `_iter`: result of the most recent `iterate` call on the loaded chunk,
  or `nothing`.
- `_chunkstore`: the column data of the loaded chunk, or `nothing`.

The underscore-prefixed fields are iteration state and are mutated by
`iterate`. NOTE(review): because the state lives in the object, two
interleaved iterations over the same `DTableColumn` will likely interfere
with each other — confirm whether that matters for callers.
"""
mutable struct DTableColumn{T,TT}
    dtable::DTable
    col::Int
    colname::Symbol
    chunk_lengths::Vector{Int}
    _chunk::Int
    _iter::Union{Nothing,TT}
    _chunkstore::Union{Nothing,Vector{T}}
end
11 | 10 |
|
"""
    getcolumn_chunk(chunk_contents, col::Int)

Return column number `col` of `chunk_contents`, obtained through
`Tables.columns` followed by `Tables.getcolumn`.

This is a named function (rather than an anonymous one) so it can be passed
directly to `Dagger.spawn`.
"""
function getcolumn_chunk(chunk_contents, col::Int)
    columns = Tables.columns(chunk_contents)
    return Tables.getcolumn(columns, col)
end
13 | 14 |
|
"""
    DTableColumn(d::DTable, col::Int)

Build a `DTableColumn` for the column at position `col` of `d`.

The element type is read from the table schema. The iterator-state type is
discovered by calling `iterate` on the column of each chunk in turn until one
yields something other than `nothing` (i.e. until a non-empty chunk is
found). If every chunk is empty the iterator type stays `Nothing`.
"""
function DTableColumn(d::DTable, col::Int)
    column_eltype = Tables.schema(Tables.columns(d)).types[col]

    # `typeof(iterate(x))` is `Nothing` for an empty chunk, so keep probing
    # chunks until a non-empty one reveals the real iterator type.
    iterator_type = Nothing
    c_idx = 1
    while iterator_type === Nothing && c_idx <= nchunks(d)
        iterator_type = fetch(Dagger.spawn(
            (ch, _col) -> typeof(iterate(getcolumn_chunk(ch, _col))),
            d.chunks[c_idx],
            col
        ))
        c_idx += 1
    end

    # Positional arguments follow the struct's field order:
    # dtable, col, colname, chunk_lengths, _chunk, _iter, _chunkstore
    DTableColumn{column_eltype,iterator_type}(
        d,
        col,
        _columnnames_svector(d)[col],
        chunk_lengths(d),
        0,
        nothing,
        nothing,
    )
end
28 | 39 |
|
29 | 40 |
|
"""
    DTableColumn(d::DTable, col::String)
    DTableColumn(d::DTable, col::Symbol)

Build a `DTableColumn` for the column named `col`.

The name is matched against the stringified column names of `d`; the first
match is used. Throws an `ArgumentError` when no column has that name.
"""
function DTableColumn(d::DTable, col::String)
    colnames = string.(_columnnames_svector(d))
    idx = findfirst(==(col), colnames)
    # Fail with a descriptive error instead of forwarding `nothing` to the
    # positional constructor, which would only produce a MethodError.
    idx === nothing && throw(ArgumentError("column \"$col\" not found in the DTable"))
    return DTableColumn(d, idx)
end
DTableColumn(d::DTable, col::Symbol) = DTableColumn(d, string(col))
50 | 44 |
|
"""
    length(dtc::DTableColumn)

Total number of elements in the column: the sum of the stored per-chunk
lengths.
"""
length(dtc::DTableColumn) = sum(dtc.chunk_lengths)
52 | 46 |
|
53 | 47 |
|
"""
    pull_next_chunk!(dtc::DTableColumn)

Advance `dtc` to the next non-empty chunk when the current chunk is
exhausted.

While `dtc._iter` is `nothing`, the next chunk's column is fetched into
`dtc._chunkstore` and `iterate` is started on it. Empty chunks are skipped.
If there are no more chunks, `dtc._iter` is left as `nothing`. When
`dtc._iter` is already non-`nothing` this is a no-op.

Always returns `nothing`; the result is communicated through the mutated
fields of `dtc`.
"""
function pull_next_chunk!(dtc::DTableColumn)
    # find first non-empty chunk
    while dtc._iter === nothing
        dtc._chunk += 1
        # ran out of chunks: leave `_iter` as `nothing` to signal the end
        dtc._chunk > nchunks(dtc.dtable) && return nothing

        dtc._chunkstore = fetch(Dagger.spawn(
            getcolumn_chunk,
            dtc.dtable.chunks[dtc._chunk],
            dtc.col
        ))
        # iterate in case this chunk is empty
        dtc._iter = iterate(dtc._chunkstore)
    end
    return nothing
end
67 | 66 |
|
68 | 67 |
|
"""
    iterate(dtc::DTableColumn)

Start iterating over the column.

Returns `nothing` for a zero-length column. Otherwise resets the cached
chunk state, loads the first non-empty chunk and returns the
`(element, state)` pair produced by iterating that chunk (or `nothing` if no
chunk yields an element).
"""
function iterate(dtc::DTableColumn)
    length(dtc) == 0 && return nothing

    # on every iteration start reset the cache
    dtc._chunkstore = nothing
    dtc._iter = nothing
    dtc._chunk = 0

    # pull the first chunk
    pull_next_chunk!(dtc)

    return dtc._iter
end
83 | 81 |
|
"""
    iterate(dtc::DTableColumn, iter)

Continue iterating over the column, where `iter` is the state returned by the
previous `iterate` call on the currently loaded chunk.

Returns `nothing` if no chunk is loaded. Otherwise advances within the loaded
chunk; when that chunk is exhausted, moves on to the next non-empty chunk.
Returns the resulting `(element, state)` pair, or `nothing` once all chunks
are consumed.
"""
function iterate(dtc::DTableColumn, iter)
    dtc._chunkstore === nothing && return nothing
    # step within the current chunk
    dtc._iter = iterate(dtc._chunkstore, iter)
    # if the current chunk just ran out, this loads the next non-empty one
    pull_next_chunk!(dtc)
    return dtc._iter
end
0 commit comments