Allow predicate in Cols (#2881)

bkamins · web-flow · commit df1b41710fd5 · 2021-09-20T20:09:30.000+02:00
diff --git a/NEWS.md b/NEWS.md
@@ -70,6 +70,8 @@
 * the `DataFrame` constructor when matrix is passed to it as a first
   argument now allows `copycols` keyword argument
   ([#2859](https://github.com/JuliaData/DataFrames.jl/pull/2859))
+* `Cols` now accepts a predicate function that takes column names as strings
+  ([#2881](https://github.com/JuliaData/DataFrames.jl/pull/2881))
 
 ## Bug fixes
 
diff --git a/Project.toml b/Project.toml
@@ -25,7 +25,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 [compat]
 CategoricalArrays = "0.10.0"
 Compat = "3.17"
-DataAPI = "1.8"
+DataAPI = "1.9"
 InvertedIndices = "1"
 IteratorInterfaceExtensions = "0.1.1, 1"
 Missings = "0.4.2, 1"
diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md
@@ -26,7 +26,17 @@ The rules for a valid type of index into a column are the following:
     * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`;
     * a regular expression, which gets expanded to a vector of matching column names;
     * a `Not` expression (see [InvertedIndices.jl](https://github.com/mbauman/InvertedIndices.jl));
-    * an `Cols`, `All` or `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl));
+      `Not(idx)` selects all indices not in the passed `idx`;
+    * a `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl));
+      `Cols(idxs...)` selects the union of the selections in `idxs`; in particular
+      `Cols()` selects no columns and `Cols(:)` selects all columns; a special rule is
+      `Cols(predicate)`, where `predicate` is a predicate function; in this case
+      the columns for which `predicate` returns `true` when passed the column name
+      as a string are selected;
+    * a `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl));
+      `Between(first, last)` selects the columns from `first` to `last` (inclusive);
+    * an `All` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl));
+      `All()` selects all columns, equivalent to `:`;
     * a colon literal `:`.
 
 The rules for a valid type of index into a row are the following:
diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md
@@ -255,18 +255,49 @@ julia> df[!, Not(:x1)]
 Finally, you can use `Not`, `Between`, `Cols` and `All` selectors in more
 complex column selection scenarios (note that `Cols()` selects no columns while
 `All()` selects all columns therefore `Cols` is a preferred selector if you
-write generic code). The following examples move all columns whose names match
-`r"x"` regular expression respectively to the front and to the end of a data
-frame:
+write generic code). Here are examples of using each of these selectors:
 
-```
+```jldoctest dataframe
 julia> df = DataFrame(r=1, x1=2, x2=3, y=4)
 1×4 DataFrame
  Row │ r      x1     x2     y
      │ Int64  Int64  Int64  Int64
 ─────┼────────────────────────────
    1 │     1      2      3      4
 
+julia> df[:, Not(:r)] # drop :r column
+1×3 DataFrame
+ Row │ x1     x2     y
+     │ Int64  Int64  Int64
+─────┼─────────────────────
+   1 │     2      3      4
+
+julia> df[:, Between(:r, :x2)] # keep columns between :r and :x2
+1×3 DataFrame
+ Row │ r      x1     x2
+     │ Int64  Int64  Int64
+─────┼─────────────────────
+   1 │     1      2      3
+
+julia> df[:, All()] # keep all columns
+1×4 DataFrame
+ Row │ r      x1     x2     y
+     │ Int64  Int64  Int64  Int64
+─────┼────────────────────────────
+   1 │     1      2      3      4
+
+julia> df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts with "x"
+1×2 DataFrame
+ Row │ x1     x2
+     │ Int64  Int64
+─────┼──────────────
+   1 │     2      3
+```
+
+The following examples show a more complex use of the `Cols` selector: they move all
+columns whose names match the `r"x"` regular expression to the front and to the end
+of the data frame, respectively:
+```jldoctest dataframe
 julia> df[:, Cols(r"x", :)]
 1×4 DataFrame
  Row │ x1     x2     r      y
diff --git a/src/other/index.jl b/src/other/index.jl
@@ -221,6 +221,9 @@ end
     isempty(idx.cols) ? (1:length(x)) : throw(ArgumentError("All(args...) is not supported: use Cols(args...) instead"))
 @inline Base.getindex(x::AbstractIndex, idx::Cols) =
     isempty(idx.cols) ? Int[] : union(getindex.(Ref(x), idx.cols)...)
+@inline Base.getindex(x::AbstractIndex, ::Cols{Tuple{Colon}}) = x[:]  # Cols(:) => all columns
+@inline Base.getindex(x::AbstractIndex, idx::Cols{<:Tuple{Function}}) =
+    findall(first(idx.cols), names(x))  # sole predicate is applied to each column name
 
 @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer})
     if any(v -> v isa Bool, idx)
diff --git a/test/index.jl b/test/index.jl
@@ -474,6 +474,11 @@ end
     df = DataFrame(a1=1, a2=2, b1=3, b2=4)
     @test df[:, Cols(r"a", Not(r"1"))] == df[:, [1, 2, 4]]
     @test df[:, Cols(Not(r"1"), r"a")] == df[:, [2, 4, 1]]
+    @test df[:, Cols(x -> x[1] == 'a')] == df[:, [1, 2]]
+    @test df[:, Cols(x -> x[end] == '1')] == df[:, [1, 3]]
+    @test df[:, Cols(x -> x[end] == '3')] == DataFrame()
+    @test_throws MethodError df[:, Cols(x -> true, 1)] # predicate must be the sole argument
+    @test_throws MethodError df[:, Cols(1, x -> true)] # regardless of its position
 end
 
 @testset "views" begin