Avoid method dispatch ambiguities in DataFrames.jl (#3179)

bkamins · web-flow · commit c3fd232a8bc2 · 2022-09-26T13:39:00.000+02:00
diff --git a/NEWS.md b/NEWS.md
@@ -88,6 +88,13 @@
   ([#3081](https://github.com/JuliaData/DataFrames.jl/pull/3081))
 * Make `subset` preserves group ordering when `ungroup=false` like `subset!` already does
   ([#3094](https://github.com/JuliaData/DataFrames.jl/pull/3094))
+* Fix incorrect behavior of `GroupDataFrame` indexing in corner cases
+  ([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
+* Fix errors in `insertcols!` when no columns to add are passed
+  ([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
+* Fix errors in `minimum` and `maximum` aggregates
+  when processing `GroupedDataFrame` with `combine` in corner cases
+  ([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
 
 ## Performance
 
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -3097,7 +3097,7 @@ julia> insertcols!(df, :b, :d => 7:9, after=true)
    3 │ c         9      4      5      3
 ```
 """
-function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...;
+function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...;
                      after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
     if !is_column_insertion_allowed(df)
         throw(ArgumentError("insertcols! is only supported for DataFrame, or for " *
@@ -3222,31 +3222,47 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
     return df
 end
 
-insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...;
+insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...;
             after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
     insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)...,
                 after=after, makeunique=makeunique, copycols=copycols)
 
-insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol, <:Any}...;
+insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...;
             after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
     insertcols!(df, ncol(df)+1, name_cols..., after=after,
                 makeunique=makeunique, copycols=copycols)
 
-insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString, <:Any}...;
+insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...;
             after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
     insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)...,
                 after=after, makeunique=makeunique, copycols=copycols)
 
-function insertcols!(df::AbstractDataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, name_cols...)
-    if !(0 < col <= ncol(df) + 1)
-        throw(ArgumentError("attempt to insert a column to a data frame with " *
-                            "$(ncol(df)) columns at index $col"))
+function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
+                     makeunique::Bool=false, copycols::Bool=true)
+    if col isa SymbolOrString
+        col_ind = Int(columnindex(df, col))
+        if col_ind == 0
+            throw(ArgumentError("column $col does not exist in data frame"))
+        end
+    else
+        col_ind = Int(col)
     end
-    if !isempty(name_cols)
-        # an explicit error is thrown as keyword argument was supported in the past
-        throw(ArgumentError("inserting columns using a keyword argument is not supported, " *
-                            "pass a Pair as a positional argument instead"))
+
+    if after
+        col_ind += 1
     end
+
+    if !(0 < col_ind <= ncol(df) + 1)
+        throw(ArgumentError("attempt to insert a column to a data frame with " *
+                            "$(ncol(df)) columns at index $col_ind"))
+    end
+
+    _drop_all_nonnote_metadata!(parent(df))
+    return df
+end
+
+function insertcols!(df::AbstractDataFrame; after::Bool=false,
+                     makeunique::Bool=false, copycols::Bool=true)
     _drop_all_nonnote_metadata!(parent(df))
     return df
 end
diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl
@@ -414,6 +414,10 @@ function Base.issorted(df::AbstractDataFrame, cols=All();
     end
 end
 
+Base.issorted(::AbstractDataFrame, ::Base.Order.Ordering) =
+    throw(ArgumentError("second positional argument of `issorted` on " *
+                        "a data frame must be a column selector"))
+
 """
     sort(df::AbstractDataFrame, cols=All();
          alg::Union{Algorithm, Nothing}=nothing,
diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl
@@ -17,22 +17,14 @@ check_aggregate(f::typeof(prod), ::AbstractVector{<:Union{Missing, Number}}) =
     Reduce(Base.mul_prod)
 check_aggregate(f::typeof(prod∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) =
     Reduce(Base.mul_prod, !ismissing)
-check_aggregate(f::typeof(maximum),
-                ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
-check_aggregate(f::typeof(maximum), v::AbstractVector{<:Union{Missing, Real}}) =
-    eltype(v) === Any ? f : Reduce(max)
-check_aggregate(f::typeof(maximum∘skipmissing),
-                ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
-check_aggregate(f::typeof(maximum∘skipmissing), v::AbstractVector{<:Union{Missing, Real}}) =
-    eltype(v) === Any ? f : Reduce(max, !ismissing, nothing, true)
-check_aggregate(f::typeof(minimum),
-                ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
-check_aggregate(f::typeof(minimum), v::AbstractVector{<:Union{Missing, Real}}) =
-    eltype(v) === Any ? f : Reduce(min)
-check_aggregate(f::typeof(minimum∘skipmissing),
-                ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
-check_aggregate(f::typeof(minimum∘skipmissing), v::AbstractVector{<:Union{Missing, Real}}) =
-    eltype(v) === Any ? f : Reduce(min, !ismissing, nothing, true)
+check_aggregate(f::typeof(maximum), ::AbstractVector{<:Union{Missing, Real}}) =
+    Reduce(max)
+check_aggregate(f::typeof(maximum∘skipmissing), ::AbstractVector{<:Union{Missing, Real}}) =
+    Reduce(max, !ismissing, nothing, true)
+check_aggregate(f::typeof(minimum), ::AbstractVector{<:Union{Missing, Real}}) =
+    Reduce(min)
+check_aggregate(f::typeof(minimum∘skipmissing), ::AbstractVector{<:Union{Missing, Real}}) =
+    Reduce(min, !ismissing, nothing, true)
 check_aggregate(f::typeof(mean), ::AbstractVector{<:Union{Missing, Number}}) =
     Reduce(Base.add_sum, nothing, /)
 check_aggregate(f::typeof(mean∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) =
diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -748,14 +748,13 @@ Base.IndexStyle(::Type{<:GroupKeys}) = IndexLinear()
     return GroupKey(parent(gk), i)
 end
 
-
 #
 # Non-standard indexing
 #
 
 # Non-standard indexing relies on converting to integer indices first
 # The full version (to_indices) is required rather than to_index even though
-# GroupedDataFrame behaves as a 1D array due to the behavior of Colon and Not.
+# GroupedDataFrame behaves as a 1D array due to the behavior of Not.
 # Note that this behavior would be the default if it was <:AbstractArray
 function Base.getindex(gd::GroupedDataFrame, idx...)
     length(idx) == 1 || throw(ArgumentError("GroupedDataFrame requires a single index"))
@@ -767,6 +766,10 @@ const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple, AbstractDict{Symbol}, A
 # All allowed scalar index types
 const GroupIndexTypes = Union{Integer, GroupKeyTypes}
 
+# GroupedDataFrame is not a multidimensional array, so it does not support cartesian indexing
+Base.to_indices(gd::GroupedDataFrame, (idx,)::Tuple{CartesianIndex}) =
+    throw(ArgumentError("Invalid index: $idx of type $(typeof(idx))"))
+
 # Find integer index for dictionary keys
 function Base.to_index(gd::GroupedDataFrame, key::GroupKey)
     gd === parent(key) && return getfield(key, :idx)
@@ -864,13 +867,30 @@ end
 # ambiguity in dispatch
 function Base.to_indices(gd::GroupedDataFrame,
                          (idx,)::Tuple{Not{<:Union{BitArray{1}, Vector{Bool}}}})
-    (findall(!, idx.skip),)
+    if length(idx.skip) != length(gd)
+        throw(BoundsError("attempt to index $(length(gd))-group GroupedDataFrame " *
+                          "with $(length(idx.skip))-element Boolean vector"))
+    end
+    return (findall(!, idx.skip),)
 end
 function Base.to_indices(gd::GroupedDataFrame,
                          (idx,)::Tuple{Not{<:AbstractVector{Bool}}})
-    (findall(!, idx.skip),)
+    if length(idx.skip) != length(gd)
+        throw(BoundsError("attempt to index $(length(gd))-group GroupedDataFrame " *
+                          "with $(length(idx.skip))-element Boolean vector"))
+    end
+    return (findall(!, idx.skip),)
 end
 
+# Needed to avoid ambiguity
+@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:InvertedIndices.NIdx{1}}}) =
+    throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))
+
+@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:InvertedIndices.NIdx}}) =
+    throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))
+
+@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:Union{Array{Bool}, BitArray}}}) =
+    throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))
 
 #
 # Dictionary interface
diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl
@@ -17,6 +17,9 @@ Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.BroadcastStyle)
 Base.Broadcast.BroadcastStyle(::Base.Broadcast.BroadcastStyle, ::DataFrameStyle) =
     DataFrameStyle()
 Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::DataFrameStyle) = DataFrameStyle()
+# The method below is added to avoid dispatch ambiguity
+Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.Unknown) =
+    DataFrameStyle()
 
 function copyto_widen!(res::AbstractVector{T}, bc::Base.Broadcast.Broadcasted,
                        pos, col) where T
@@ -225,6 +228,9 @@ function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src)
     return src
 end
 
+# The method below is added to avoid dispatch ambiguity
+Base.Broadcast.broadcast_unalias(::Nothing, src::AbstractDataFrame) = src
+
 function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataFrame)
     wascopied = false
     for (i, col) in enumerate(eachcol(src))
@@ -371,6 +377,10 @@ end
 Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src) =
     Base.Broadcast.broadcast_unalias(parent(dest), src)
 
+# this is currently impossible but is added to avoid potential dispatch ambiguity in the future
+Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src::AbstractDataFrame) =
+    Base.Broadcast.broadcast_unalias(parent(dest), src)
+
 function Base.copyto!(dfr::DataFrameRow, bc::Base.Broadcast.Broadcasted)
     bc′ = Base.Broadcast.preprocess(dfr, bc)
     for I in eachindex(bc′)
diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl
@@ -73,6 +73,10 @@ struct SubDataFrame{D<:AbstractDataFrame, S<:AbstractIndex, T<:AbstractVector{In
     rows::T # maps from subdf row indexes to parent row indexes
 end
 
+# this method should be never called by DataFrames.jl code, but is added for safety
+SubDataFrame(parent::SubDataFrame, colindex::AbstractIndex, rows::AbstractVector{Int}) =
+    throw(ArgumentError("Creation of a SubDataFrame from a SubDataFrame is not allowed"))
+
 Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector{Int}, cols)
     @boundscheck if !checkindex(Bool, axes(parent, 1), rows)
         throw(BoundsError(parent, (rows, cols)))
diff --git a/test/broadcasting.jl b/test/broadcasting.jl
@@ -1957,4 +1957,14 @@ end
     @test_throws ArgumentError dfv.c .= [1, 2]
 end
 
+@testset "test coverage for corner cases that are not normally called" begin
+    @test Base.Broadcast.BroadcastStyle(DataFrames.DataFrameStyle(),
+                                        Base.Broadcast.Unknown()) isa DataFrames.DataFrameStyle
+    df = DataFrame(a=1)
+    @test Base.Broadcast.broadcast_unalias(nothing, df) === df
+    @test Base.Broadcast.broadcast_unalias(df[1, :], df) == df
+    @test Base.Broadcast.broadcast_unalias(df[1, :], df) !== df
+    @test Base.Broadcast.broadcast_unalias(copy(df)[1, :], df) === df
+end
+
 end # module
diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -263,7 +263,7 @@ end
     dfc = copy(df)
     @test insertcols!(df, 2) == dfc
     @test_throws ArgumentError insertcols!(df, 10)
-    @test_throws ArgumentError insertcols!(df, 2, a=1, b=2)
+    @test_throws MethodError insertcols!(df, 2, a=1, b=2)
 
     df = DataFrame()
     @test insertcols!(df, 1, :x=>[1]) == DataFrame(x=[1])
@@ -348,7 +348,7 @@ end
     @test insertcols!(df, "a" => 2, makeunique=true) == DataFrame(a=1, a_1=2)
 end
 
-@testset "insertcols!" begin
+@testset "insertcols! old tests" begin
     df = DataFrame(a=1:3, b=4:6)
     df2 = insertcols(df, :c => 1)
     @test df == DataFrame(a=1:3, b=4:6)
@@ -361,9 +361,20 @@ end
     @test df2[!, 1] === x
 end
 
-@testset "unsupported insertcols!" begin
+@testset "insertcols! with no cols" begin
     df = DataFrame(x=1:2)
-    @test_throws ArgumentError insertcols!(df, 2, y=2:3)
+    @test_throws ArgumentError insertcols!(df, 0)
+    @test insertcols!(df, 2) === df
+    @test insertcols!(df, 2) == DataFrame(x=1:2)
+    @test insertcols!(df, :x) == DataFrame(x=1:2)
+    @test insertcols!(df, "x") == DataFrame(x=1:2)
+    @test insertcols!(df, "x", after=true, makeunique=true, copycols=true) == DataFrame(x=1:2)
+    @test insertcols!(df, 0, after=true) == DataFrame(x=1:2)
+    @test_throws ArgumentError insertcols!(df, 2, after=true)
+    @test insertcols!(df) === df
+    @test insertcols!(df) == DataFrame(x=1:2)
+    @test insertcols!(df, after=true, makeunique=true, copycols=true) == DataFrame(x=1:2)
+    @test_throws ArgumentError insertcols!(DataFrame(), :b)
 end
 
 @testset "insertcols! after" begin
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -4276,4 +4276,40 @@ end
     end
 end
 
+@testset "maximum and minimum on missing" begin
+    df = DataFrame(id=[1,1,2,2], x=fill(missing, 4))
+    gdf = groupby_checked(df, :id)
+    @test combine(gdf, :x => maximum => :x) ≅ DataFrame(id=1:2, x=fill(missing, 2))
+    @test combine(gdf, :x => minimum => :x) ≅ DataFrame(id=1:2, x=fill(missing, 2))
+    @test_throws ArgumentError combine(gdf, :x => maximum∘skipmissing)
+    @test_throws ArgumentError combine(gdf, :x => minimum∘skipmissing)
+end
+
+@testset "corner cases of indexing" begin
+    df = DataFrame(id=1:4)
+    gdf = groupby_checked(df, :id)
+    @test_throws ArgumentError gdf[CartesianIndex(1)]
+    @test_throws ArgumentError gdf[CartesianIndex(1, 1)]
+    @test_throws ArgumentError gdf[[CartesianIndex(1)]]
+    @test_throws ArgumentError gdf[[CartesianIndex(1, 1)]]
+    @test_throws ArgumentError gdf[Any[CartesianIndex(1)]]
+    @test_throws ArgumentError gdf[Any[CartesianIndex(1, 1)]]
+
+    @test_throws ArgumentError gdf[Not(CartesianIndex(1))]
+    @test_throws ArgumentError gdf[Not(CartesianIndex(1, 1))]
+    @test_throws ArgumentError gdf[Not([CartesianIndex(1)])]
+    @test_throws ArgumentError gdf[Not([CartesianIndex(1, 1)])]
+    @test_throws ArgumentError gdf[Not(Any[CartesianIndex(1)])]
+    @test_throws ArgumentError gdf[Not(Any[CartesianIndex(1, 1)])]
+
+    @test_throws BoundsError gdf[[true]]
+    @test_throws BoundsError gdf[Not([true])]
+    @test_throws BoundsError gdf[trues(1)]
+    @test_throws BoundsError gdf[Not(trues(1))]
+    @test_throws BoundsError gdf[view([true], 1:1)]
+    @test_throws BoundsError gdf[Not(view([true], 1:1))]
+    @test_throws BoundsError gdf[[true true true true]]
+    @test_throws ArgumentError gdf[Not([true true true true])]
+end
+
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -14,6 +14,20 @@ else
     @show Threads.nthreads()
 end
 
+ambiguities_vec = Test.detect_ambiguities(DataFrames, recursive=true)
+if !isempty(ambiguities_vec)
+    @error "Method ambiguities:"
+    display(ambiguities_vec)
+    throw(AssertionError("method dispatch ambiguities found"))
+end
+
+unbound_args_vec = Test.detect_unbound_args(DataFrames, recursive=true)
+if !isempty(unbound_args_vec)
+    @error "Unbound type parameters:"
+    display(unbound_args_vec)
+    throw(AssertionError("unbound type parameters found"))
+end
+
 my_tests = ["utils.jl",
             "cat.jl",
             "data.jl",
diff --git a/test/sort.jl b/test/sort.jl
@@ -322,6 +322,7 @@ end
         @test issorted(df, rev=fill(false, ncol(df)))
         @test issorted(df, order=Base.Forward)
         @test issorted(df, order=fill(Base.Forward, ncol(df)))
+        @test_throws ArgumentError issorted(df, Base.Order.Forward)
 
         @test issorted(df, :x, by=identity)
         @test issorted(df, :x, by=[identity])
diff --git a/test/subdataframe.jl b/test/subdataframe.jl
@@ -301,4 +301,10 @@ end
     end
 end
 
+@testset "disallowed use of SubDataFrame constructor" begin
+    df = DataFrame(a=1)
+    sdf = view(df, :, :)
+    @test_throws ArgumentError SubDataFrame(sdf, DataFrames.index(sdf), [1])
+end
+
 end # module