Skip to content

Commit c3fd232

Browse files
authored
Avoid method dispatch ambiguities in DataFrames.jl (#3179)
1 parent 515a348 commit c3fd232

File tree

13 files changed

+167
-36
lines changed

13 files changed

+167
-36
lines changed

NEWS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@
8888
([#3081](https://github.com/JuliaData/DataFrames.jl/pull/3081))
8989
* Make `subset` preserve group ordering when `ungroup=false`, as `subset!` already does
9090
([#3094](https://github.com/JuliaData/DataFrames.jl/pull/3094))
91+
* Fix incorrect behavior of `GroupedDataFrame` indexing in corner cases
92+
([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
93+
* Fix errors in `insertcols!` when no columns to add are passed
94+
([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
95+
* Fix errors in `minimum` and `maximum` aggregates
96+
when processing `GroupedDataFrame` with `combine` in corner cases
97+
([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
9198

9299
## Performance
93100

src/abstractdataframe/abstractdataframe.jl

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3097,7 +3097,7 @@ julia> insertcols!(df, :b, :d => 7:9, after=true)
30973097
3 │ c 9 4 5 3
30983098
```
30993099
"""
3100-
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...;
3100+
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...;
31013101
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
31023102
if !is_column_insertion_allowed(df)
31033103
throw(ArgumentError("insertcols! is only supported for DataFrame, or for " *
@@ -3222,31 +3222,47 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
32223222
return df
32233223
end
32243224

3225-
insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...;
3225+
insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...;
32263226
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
32273227
insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)...,
32283228
after=after, makeunique=makeunique, copycols=copycols)
32293229

3230-
insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol, <:Any}...;
3230+
insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...;
32313231
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
32323232
insertcols!(df, ncol(df)+1, name_cols..., after=after,
32333233
makeunique=makeunique, copycols=copycols)
32343234

3235-
insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString, <:Any}...;
3235+
insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...;
32363236
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
32373237
insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)...,
32383238
after=after, makeunique=makeunique, copycols=copycols)
32393239

3240-
function insertcols!(df::AbstractDataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, name_cols...)
3241-
if !(0 < col <= ncol(df) + 1)
3242-
throw(ArgumentError("attempt to insert a column to a data frame with " *
3243-
"$(ncol(df)) columns at index $col"))
3240+
function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
3241+
makeunique::Bool=false, copycols::Bool=true)
3242+
if col isa SymbolOrString
3243+
col_ind = Int(columnindex(df, col))
3244+
if col_ind == 0
3245+
throw(ArgumentError("column $col does not exist in data frame"))
3246+
end
3247+
else
3248+
col_ind = Int(col)
32443249
end
3245-
if !isempty(name_cols)
3246-
# an explicit error is thrown as keyword argument was supported in the past
3247-
throw(ArgumentError("inserting columns using a keyword argument is not supported, " *
3248-
"pass a Pair as a positional argument instead"))
3250+
3251+
if after
3252+
col_ind += 1
32493253
end
3254+
3255+
if !(0 < col_ind <= ncol(df) + 1)
3256+
throw(ArgumentError("attempt to insert a column to a data frame with " *
3257+
"$(ncol(df)) columns at index $col_ind"))
3258+
end
3259+
3260+
_drop_all_nonnote_metadata!(parent(df))
3261+
return df
3262+
end
3263+
3264+
function insertcols!(df::AbstractDataFrame; after::Bool=false,
3265+
makeunique::Bool=false, copycols::Bool=true)
32503266
_drop_all_nonnote_metadata!(parent(df))
32513267
return df
32523268
end

src/abstractdataframe/sort.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,10 @@ function Base.issorted(df::AbstractDataFrame, cols=All();
414414
end
415415
end
416416

417+
Base.issorted(::AbstractDataFrame, ::Base.Order.Ordering) =
418+
throw(ArgumentError("second positional argument of `issorted` on " *
419+
"a data frame must be a column selector"))
420+
417421
"""
418422
sort(df::AbstractDataFrame, cols=All();
419423
alg::Union{Algorithm, Nothing}=nothing,

src/groupeddataframe/fastaggregates.jl

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,14 @@ check_aggregate(f::typeof(prod), ::AbstractVector{<:Union{Missing, Number}}) =
1717
Reduce(Base.mul_prod)
1818
check_aggregate(f::typeof(prodskipmissing), ::AbstractVector{<:Union{Missing, Number}}) =
1919
Reduce(Base.mul_prod, !ismissing)
20-
check_aggregate(f::typeof(maximum),
21-
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
22-
check_aggregate(f::typeof(maximum), v::AbstractVector{<:Union{Missing, Real}}) =
23-
eltype(v) === Any ? f : Reduce(max)
24-
check_aggregate(f::typeof(maximumskipmissing),
25-
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
26-
check_aggregate(f::typeof(maximumskipmissing), v::AbstractVector{<:Union{Missing, Real}}) =
27-
eltype(v) === Any ? f : Reduce(max, !ismissing, nothing, true)
28-
check_aggregate(f::typeof(minimum),
29-
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
30-
check_aggregate(f::typeof(minimum), v::AbstractVector{<:Union{Missing, Real}}) =
31-
eltype(v) === Any ? f : Reduce(min)
32-
check_aggregate(f::typeof(minimumskipmissing),
33-
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
34-
check_aggregate(f::typeof(minimumskipmissing), v::AbstractVector{<:Union{Missing, Real}}) =
35-
eltype(v) === Any ? f : Reduce(min, !ismissing, nothing, true)
20+
check_aggregate(f::typeof(maximum), ::AbstractVector{<:Union{Missing, Real}}) =
21+
Reduce(max)
22+
check_aggregate(f::typeof(maximumskipmissing), ::AbstractVector{<:Union{Missing, Real}}) =
23+
Reduce(max, !ismissing, nothing, true)
24+
check_aggregate(f::typeof(minimum), ::AbstractVector{<:Union{Missing, Real}}) =
25+
Reduce(min)
26+
check_aggregate(f::typeof(minimumskipmissing), ::AbstractVector{<:Union{Missing, Real}}) =
27+
Reduce(min, !ismissing, nothing, true)
3628
check_aggregate(f::typeof(mean), ::AbstractVector{<:Union{Missing, Number}}) =
3729
Reduce(Base.add_sum, nothing, /)
3830
check_aggregate(f::typeof(meanskipmissing), ::AbstractVector{<:Union{Missing, Number}}) =

src/groupeddataframe/groupeddataframe.jl

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -748,14 +748,13 @@ Base.IndexStyle(::Type{<:GroupKeys}) = IndexLinear()
748748
return GroupKey(parent(gk), i)
749749
end
750750

751-
752751
#
753752
# Non-standard indexing
754753
#
755754

756755
# Non-standard indexing relies on converting to integer indices first
757756
# The full version (to_indices) is required rather than to_index even though
758-
# GroupedDataFrame behaves as a 1D array due to the behavior of Colon and Not.
757+
# GroupedDataFrame behaves as a 1D array due to the behavior of Not.
759758
# Note that this behavior would be the default if it was <:AbstractArray
760759
function Base.getindex(gd::GroupedDataFrame, idx...)
761760
length(idx) == 1 || throw(ArgumentError("GroupedDataFrame requires a single index"))
@@ -767,6 +766,10 @@ const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple, AbstractDict{Symbol}, A
767766
# All allowed scalar index types
768767
const GroupIndexTypes = Union{Integer, GroupKeyTypes}
769768

769+
# GroupedDataFrame is not a multidimensional array, so it does not support cartesian indexing
770+
Base.to_indices(gd::GroupedDataFrame, (idx,)::Tuple{CartesianIndex}) =
771+
throw(ArgumentError("Invalid index: $idx of type $(typeof(idx))"))
772+
770773
# Find integer index for dictionary keys
771774
function Base.to_index(gd::GroupedDataFrame, key::GroupKey)
772775
gd === parent(key) && return getfield(key, :idx)
@@ -864,13 +867,30 @@ end
864867
# ambiguity in dispatch
865868
function Base.to_indices(gd::GroupedDataFrame,
866869
(idx,)::Tuple{Not{<:Union{BitArray{1}, Vector{Bool}}}})
867-
(findall(!, idx.skip),)
870+
if length(idx.skip) != length(gd)
871+
throw(BoundsError("attempt to index $(length(gd))-group GroupedDataFrame " *
872+
"with $(length(idx.skip))-element Boolean vector"))
873+
end
874+
return (findall(!, idx.skip),)
868875
end
869876
function Base.to_indices(gd::GroupedDataFrame,
870877
(idx,)::Tuple{Not{<:AbstractVector{Bool}}})
871-
(findall(!, idx.skip),)
878+
if length(idx.skip) != length(gd)
879+
throw(BoundsError("attempt to index $(length(gd))-group GroupedDataFrame " *
880+
"with $(length(idx.skip))-element Boolean vector"))
881+
end
882+
return (findall(!, idx.skip),)
872883
end
873884

885+
# Needed to avoid ambiguity
886+
@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:InvertedIndices.NIdx{1}}}) =
887+
throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))
888+
889+
@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:InvertedIndices.NIdx}}) =
890+
throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))
891+
892+
@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:Union{Array{Bool}, BitArray}}}) =
893+
throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))
874894

875895
#
876896
# Dictionary interface

src/other/broadcasting.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.BroadcastStyle)
1717
Base.Broadcast.BroadcastStyle(::Base.Broadcast.BroadcastStyle, ::DataFrameStyle) =
1818
DataFrameStyle()
1919
Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::DataFrameStyle) = DataFrameStyle()
20+
# The method below is added to avoid dispatch ambiguity
21+
Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.Unknown) =
22+
DataFrameStyle()
2023

2124
function copyto_widen!(res::AbstractVector{T}, bc::Base.Broadcast.Broadcasted,
2225
pos, col) where T
@@ -225,6 +228,9 @@ function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src)
225228
return src
226229
end
227230

231+
# The method below is added to avoid dispatch ambiguity
232+
Base.Broadcast.broadcast_unalias(::Nothing, src::AbstractDataFrame) = src
233+
228234
function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataFrame)
229235
wascopied = false
230236
for (i, col) in enumerate(eachcol(src))
@@ -371,6 +377,10 @@ end
371377
Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src) =
372378
Base.Broadcast.broadcast_unalias(parent(dest), src)
373379

380+
# this is currently impossible but is added to avoid potential dispatch ambiguity in the future
381+
Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src::AbstractDataFrame) =
382+
Base.Broadcast.broadcast_unalias(parent(dest), src)
383+
374384
function Base.copyto!(dfr::DataFrameRow, bc::Base.Broadcast.Broadcasted)
375385
bc′ = Base.Broadcast.preprocess(dfr, bc)
376386
for I in eachindex(bc′)

src/subdataframe/subdataframe.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ struct SubDataFrame{D<:AbstractDataFrame, S<:AbstractIndex, T<:AbstractVector{In
7373
rows::T # maps from subdf row indexes to parent row indexes
7474
end
7575

76+
# this method should never be called by DataFrames.jl code, but is added for safety
77+
SubDataFrame(parent::SubDataFrame, colindex::AbstractIndex, rows::AbstractVector{Int}) =
78+
throw(ArgumentError("Creation of a SubDataFrame from a SubDataFrame is not allowed"))
79+
7680
Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector{Int}, cols)
7781
@boundscheck if !checkindex(Bool, axes(parent, 1), rows)
7882
throw(BoundsError(parent, (rows, cols)))

test/broadcasting.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1957,4 +1957,14 @@ end
19571957
@test_throws ArgumentError dfv.c .= [1, 2]
19581958
end
19591959

1960+
@testset "test coverage for corner cases that are not normally called" begin
1961+
@test Base.Broadcast.BroadcastStyle(DataFrames.DataFrameStyle(),
1962+
Base.Broadcast.Unknown()) isa DataFrames.DataFrameStyle
1963+
df = DataFrame(a=1)
1964+
@test Base.Broadcast.broadcast_unalias(nothing, df) === df
1965+
@test Base.Broadcast.broadcast_unalias(df[1, :], df) == df
1966+
@test Base.Broadcast.broadcast_unalias(df[1, :], df) !== df
1967+
@test Base.Broadcast.broadcast_unalias(copy(df)[1, :], df) === df
1968+
end
1969+
19601970
end # module

test/dataframe.jl

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ end
263263
dfc = copy(df)
264264
@test insertcols!(df, 2) == dfc
265265
@test_throws ArgumentError insertcols!(df, 10)
266-
@test_throws ArgumentError insertcols!(df, 2, a=1, b=2)
266+
@test_throws MethodError insertcols!(df, 2, a=1, b=2)
267267

268268
df = DataFrame()
269269
@test insertcols!(df, 1, :x=>[1]) == DataFrame(x=[1])
@@ -348,7 +348,7 @@ end
348348
@test insertcols!(df, "a" => 2, makeunique=true) == DataFrame(a=1, a_1=2)
349349
end
350350

351-
@testset "insertcols!" begin
351+
@testset "insertcols! old tests" begin
352352
df = DataFrame(a=1:3, b=4:6)
353353
df2 = insertcols(df, :c => 1)
354354
@test df == DataFrame(a=1:3, b=4:6)
@@ -361,9 +361,20 @@ end
361361
@test df2[!, 1] === x
362362
end
363363

364-
@testset "unsupported insertcols!" begin
364+
@testset "insertcols! with no cols" begin
365365
df = DataFrame(x=1:2)
366-
@test_throws ArgumentError insertcols!(df, 2, y=2:3)
366+
@test_throws ArgumentError insertcols!(df, 0)
367+
@test insertcols!(df, 2) === df
368+
@test insertcols!(df, 2) == DataFrame(x=1:2)
369+
@test insertcols!(df, :x) == DataFrame(x=1:2)
370+
@test insertcols!(df, "x") == DataFrame(x=1:2)
371+
@test insertcols!(df, "x", after=true, makeunique=true, copycols=true) == DataFrame(x=1:2)
372+
@test insertcols!(df, 0, after=true) == DataFrame(x=1:2)
373+
@test_throws ArgumentError insertcols!(df, 2, after=true)
374+
@test insertcols!(df) === df
375+
@test insertcols!(df) == DataFrame(x=1:2)
376+
@test insertcols!(df, after=true, makeunique=true, copycols=true) == DataFrame(x=1:2)
377+
@test_throws ArgumentError insertcols!(DataFrame(), :b)
367378
end
368379

369380
@testset "insertcols! after" begin

test/grouping.jl

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4276,4 +4276,40 @@ end
42764276
end
42774277
end
42784278

4279+
@testset "maximum and minimum on missing" begin
4280+
df = DataFrame(id=[1,1,2,2], x=fill(missing, 4))
4281+
gdf = groupby_checked(df, :id)
4282+
@test combine(gdf, :x => maximum => :x) ≅ DataFrame(id=1:2, x=fill(missing, 2))
4283+
@test combine(gdf, :x => minimum => :x) ≅ DataFrame(id=1:2, x=fill(missing, 2))
4284+
@test_throws ArgumentError combine(gdf, :x => maximumskipmissing)
4285+
@test_throws ArgumentError combine(gdf, :x => minimumskipmissing)
4286+
end
4287+
4288+
@testset "corner cases of indexing" begin
4289+
df = DataFrame(id=1:4)
4290+
gdf = groupby_checked(df, :id)
4291+
@test_throws ArgumentError gdf[CartesianIndex(1)]
4292+
@test_throws ArgumentError gdf[CartesianIndex(1, 1)]
4293+
@test_throws ArgumentError gdf[[CartesianIndex(1)]]
4294+
@test_throws ArgumentError gdf[[CartesianIndex(1, 1)]]
4295+
@test_throws ArgumentError gdf[Any[CartesianIndex(1)]]
4296+
@test_throws ArgumentError gdf[Any[CartesianIndex(1, 1)]]
4297+
4298+
@test_throws ArgumentError gdf[Not(CartesianIndex(1))]
4299+
@test_throws ArgumentError gdf[Not(CartesianIndex(1, 1))]
4300+
@test_throws ArgumentError gdf[Not([CartesianIndex(1)])]
4301+
@test_throws ArgumentError gdf[Not([CartesianIndex(1, 1)])]
4302+
@test_throws ArgumentError gdf[Not(Any[CartesianIndex(1)])]
4303+
@test_throws ArgumentError gdf[Not(Any[CartesianIndex(1, 1)])]
4304+
4305+
@test_throws BoundsError gdf[[true]]
4306+
@test_throws BoundsError gdf[Not([true])]
4307+
@test_throws BoundsError gdf[trues(1)]
4308+
@test_throws BoundsError gdf[Not(trues(1))]
4309+
@test_throws BoundsError gdf[view([true], 1:1)]
4310+
@test_throws BoundsError gdf[Not(view([true], 1:1))]
4311+
@test_throws BoundsError gdf[[true true true true]]
4312+
@test_throws ArgumentError gdf[Not([true true true true])]
4313+
end
4314+
42794315
end # module

0 commit comments

Comments
 (0)