Skip to content

Commit 6b5e4d4

Browse files
authored
DataFrames parsing update (#19)
1 parent ca37a70 commit 6b5e4d4

File tree

3 files changed

+51
-159
lines changed

3 files changed

+51
-159
lines changed

src/DTables.jl

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,15 @@ module DTables
66

77
using Dagger: Dagger
88
using DataAPI: BroadcastedSelector
9-
using DataFrames: AsTable, ByRow, ColumnIndex, MultiColumnIndex, normalize_selection, Index
9+
using DataFrames:
10+
AbstractDataFrame,
11+
AsTable,
12+
ByRow,
13+
ColumnIndex,
14+
MultiColumnIndex,
15+
normalize_selection,
16+
Index,
17+
make_pair_concrete
1018
using InvertedIndices: BroadcastedInvertedIndex
1119
using SentinelArrays: ChainedVector
1220
using TableOperations: TableOperations
@@ -45,12 +53,13 @@ import Base:
4553
import DataAPI: leftjoin, ncol, nrow, innerjoin
4654
import Tables:
4755
columnaccess, columnnames, columns, getcolumn, istable, partitions, rowaccess, rows, schema
56+
import DataFrames: broadcast_pair, select
4857

4958
############################################################################################
5059
# Export
5160
############################################################################################
5261

53-
export DTable, DTableColumn, innerjoin, leftjoin, tabletype, tabletype!, trim, trim!
62+
export DTable, DTableColumn, innerjoin, leftjoin, select, tabletype, tabletype!, trim, trim!
5463

5564
############################################################################################
5665

src/table/dataframes_interface.jl

Lines changed: 36 additions & 153 deletions
Original file line number | Diff line number | Diff line change
@@ -1,129 +1,37 @@
1-
function make_pair_concrete(@nospecialize(x::Pair))
2-
return make_pair_concrete(x.first) => make_pair_concrete(x.second)
1+
struct DTableAbstractDataFrameWrapper <: AbstractDataFrame
2+
d::DTable
33
end
4-
make_pair_concrete(@nospecialize(x)) = x
54

6-
broadcast_pair(df::DTable, @nospecialize(p::Any)) = p
5+
broadcast_pair(df::DTable, p) = broadcast_pair(DTableAbstractDataFrameWrapper(df), p)
76

8-
# Copied as is from DataFrames.jl
9-
function broadcast_pair(df::DTable, @nospecialize(p::Pair))
10-
src, second = p
11-
src_broadcast = src isa Union{BroadcastedInvertedIndex,BroadcastedSelector}
12-
second_broadcast = second isa Union{BroadcastedInvertedIndex,BroadcastedSelector}
13-
if second isa Pair
14-
fun, dst = second
15-
dst_broadcast = dst isa Union{BroadcastedInvertedIndex,BroadcastedSelector}
16-
if src_broadcast || dst_broadcast
17-
new_src = src_broadcast ? names(df, src.sel) : src
18-
new_dst = dst_broadcast ? names(df, dst.sel) : dst
19-
new_p = new_src .=> fun .=> new_dst
20-
return isempty(new_p) ? [] : new_p
21-
else
22-
return p
23-
end
24-
else
25-
if src_broadcast || second_broadcast
26-
new_src = src_broadcast ? names(df, src.sel) : src
27-
new_second = second_broadcast ? names(df, second.sel) : second
28-
new_p = new_src .=> new_second
29-
return isempty(new_p) ? [] : new_p
30-
else
31-
return p
32-
end
33-
end
7+
# Not copied - full custom implementation
8+
# There's a copymetadata here now
9+
function manipulate(
10+
dt::DTable, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool, renamecols::Bool
11+
)
12+
colidx = first(args)
13+
colname = columnnames(columns(dt))[colidx]
14+
return map(r -> (; colname => getcolumn(r, colidx)), dt)
3415
end
3516

36-
# this is needed in broadcasting when one of dimensions has length 0
37-
# as then broadcasting produces Matrix{Any} rather than Matrix{<:Pair}
38-
broadcast_pair(df::DTable, @nospecialize(p::AbstractMatrix)) = isempty(p) ? [] : p
39-
4017
# Copied as is from DataFrames.jl
41-
function broadcast_pair(df::DTable, @nospecialize(p::AbstractVecOrMat{<:Pair}))
42-
isempty(p) && return []
43-
need_broadcast = false
44-
45-
src = first.(p)
46-
first_src = first(src)
47-
if first_src isa Union{BroadcastedInvertedIndex,BroadcastedSelector}
48-
if any(!=(first_src), src)
49-
throw(
50-
ArgumentError(
51-
"when broadcasting column selector it must " * "have a constant value"
52-
),
53-
)
54-
end
55-
need_broadcast = true
56-
new_names = names(df, first_src.sel)
57-
if !(length(new_names) == size(p, 1) || size(p, 1) == 1)
58-
throw(
59-
ArgumentError(
60-
"broadcasted dimension does not match the " * "number of selected columns"
61-
),
62-
)
63-
end
64-
new_src = new_names
65-
else
66-
new_src = src
67-
end
68-
69-
second = last.(p)
70-
first_second = first(second)
71-
if first_second isa Union{BroadcastedInvertedIndex,BroadcastedSelector}
72-
if any(!=(first_second), second)
73-
throw(
74-
ArgumentError(
75-
"when using broadcasted column selector it " * "must have a constant value"
76-
),
77-
)
78-
end
79-
need_broadcast = true
80-
new_names = names(df, first_second.sel)
81-
if !(length(new_names) == size(p, 1) || size(p, 1) == 1)
82-
throw(
83-
ArgumentError(
84-
"broadcasted dimension does not match the " * "number of selected columns"
85-
),
86-
)
87-
end
88-
new_second = new_names
18+
function manipulate(
19+
df::DTable, c::MultiColumnIndex; copycols::Bool, keeprows::Bool, renamecols::Bool
20+
)
21+
if c isa AbstractVector{<:Pair}
22+
return manipulate(df, c...; copycols=copycols, keeprows=keeprows, renamecols=renamecols)
8923
else
90-
if first_second isa Pair
91-
fun, dst = first_second
92-
if dst isa Union{BroadcastedInvertedIndex,BroadcastedSelector}
93-
if !all(x -> x isa Pair && last(x) == dst, second)
94-
throw(
95-
ArgumentError(
96-
"when using broadcasted column selector " *
97-
"it must have a constant value",
98-
),
99-
)
100-
end
101-
need_broadcast = true
102-
new_names = names(df, dst.sel)
103-
if !(length(new_names) == size(p, 1) || size(p, 1) == 1)
104-
throw(
105-
ArgumentError(
106-
"broadcasted dimension does not match the " *
107-
"number of selected columns",
108-
),
109-
)
110-
end
111-
new_dst = new_names
112-
new_second = first.(second) .=> new_dst
113-
else
114-
new_second = second
115-
end
116-
else
117-
new_second = second
118-
end
24+
return manipulate(
25+
df, index(df)[c]; copycols=copycols, keeprows=keeprows, renamecols=renamecols
26+
)
11927
end
28+
end
12029

121-
if need_broadcast
122-
new_p = new_src .=> new_second
123-
return isempty(new_p) ? [] : new_p
124-
else
125-
return p
126-
end
30+
# Copied as is from DataFrames.jl
31+
function manipulate(df::DTable, c::ColumnIndex; copycols::Bool, keeprows::Bool, renamecols::Bool)
32+
return manipulate(
33+
df, Int[index(df)[c]]; copycols=copycols, keeprows=keeprows, renamecols=renamecols
34+
)
12735
end
12836

12937
# Copied as is from DataFrames.jl
@@ -138,12 +46,10 @@ function manipulate(
13846
push!(cs_vec, v)
13947
end
14048
end
141-
return _manipulate(
142-
df,
143-
Any[normalize_selection(index(df), make_pair_concrete(c), renamecols) for c in cs_vec],
144-
copycols,
145-
keeprows,
146-
)
49+
normalized_cs = Any[
50+
normalize_selection(index(df), make_pair_concrete(c), renamecols) for c in cs_vec
51+
]
52+
return _manipulate(df, normalized_cs, copycols, keeprows)
14753
end
14854

14955
# Not copied - full custom implementation
@@ -216,35 +122,6 @@ function _manipulate(df::DTable, normalized_cs::Vector{Any}, copycols::Bool, kee
216122
return rd
217123
end
218124

219-
# Not copied - full custom implementation
220-
function manipulate(
221-
dt::DTable, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool, renamecols::Bool
222-
)
223-
colidx = first(args)
224-
colname = columnnames(columns(dt))[colidx]
225-
return map(r -> (; colname => getcolumn(r, colidx)), dt)
226-
end
227-
228-
# Copied as is from DataFrames.jl
229-
function manipulate(
230-
df::DTable, c::MultiColumnIndex; copycols::Bool, keeprows::Bool, renamecols::Bool
231-
)
232-
if c isa AbstractVector{<:Pair}
233-
return manipulate(df, c...; copycols=copycols, keeprows=keeprows, renamecols=renamecols)
234-
else
235-
return manipulate(
236-
df, index(df)[c]; copycols=copycols, keeprows=keeprows, renamecols=renamecols
237-
)
238-
end
239-
end
240-
241-
# Copied as is from DataFrames.jl
242-
function manipulate(df::DTable, c::ColumnIndex; copycols::Bool, keeprows::Bool, renamecols::Bool)
243-
return manipulate(
244-
df, Int[index(df)[c]]; copycols=copycols, keeprows=keeprows, renamecols=renamecols
245-
)
246-
end
247-
248125
"""
249126
select(df::DTable, args...; copycols::Bool=true, renamecols::Bool=true)
250127
@@ -259,7 +136,13 @@ please file an issue with reproduction steps and data.
259136
260137
Please refer to DataFrames documentation for more details on usage.
261138
"""
262-
function select(df::DTable, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true)
139+
function select(
140+
df::DTable,
141+
@nospecialize(args...);
142+
copycols::Bool=true,
143+
renamecols::Bool=true,
144+
threads::Bool=true,
145+
)
263146
return manipulate(
264147
df,
265148
map(x -> broadcast_pair(df, x), args)...;

test/table_dataframes.jl

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,8 @@ using SentinelArrays: ChainedVector
1313
df = fetch(dt, DataFrame)
1414

1515
t = (args...) -> begin
16-
dt_01 = DTables.select(dt, args...)
17-
df_01 = DataFrames.select(df, args...)
16+
dt_01 = select(dt, args...)
17+
df_01 = select(df, args...)
1818

1919
result = try
2020
all(isapprox.(Tables.columns(df_01), Tables.columns(fetch(dt_01, DataFrame))))
@@ -49,7 +49,7 @@ using SentinelArrays: ChainedVector
4949
# @test # t(AsTable([:a, :b]) => identity) # this should technically fail on DTables
5050
@test t(AsTable([:a, :b]) => identity => AsTable)
5151
@test t([] => ByRow(() -> 1) => :x)
52-
@test fetch(DTables.select(dt, [] => ByRow(rand) => :x)).x isa ChainedVector{Float64, Vector{Float64}}
53-
@test fetch(DTables.select(dt, [] => (() -> rand(s)) => :x)).x isa ChainedVector{Float64, Vector{Float64}}
52+
@test fetch(select(dt, [] => ByRow(rand) => :x)).x isa ChainedVector{Float64, Vector{Float64}}
53+
@test fetch(select(dt, [] => (() -> rand(s)) => :x)).x isa ChainedVector{Float64, Vector{Float64}}
5454
end
5555
end

0 commit comments

Comments
 (0)