Skip to content

Commit ba36297

Browse files
authored
allow transformation destination to be a function (#2897)
1 parent e7acc97 commit ba36297

File tree

4 files changed

+79
-3
lines changed

4 files changed

+79
-3
lines changed

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@
7272
([#2859](https://github.com/JuliaData/DataFrames.jl/pull/2859))
7373
* `Cols` now accepts a predicate accepting column names as strings.
7474
([#2881](https://github.com/JuliaData/DataFrames.jl/pull/2881))
75+
* In `source => transformation => destination` transformation specification
76+
minilanguage, `destination` can now also be a `Function` generating
77+
target column names and taking column names specified by `source`
78+
as an argument.
79+
([#2897](https://github.com/JuliaData/DataFrames.jl/pull/2897))
7580

7681
## Bug fixes
7782

docs/src/man/split_apply_combine.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,11 @@ each subset of the `DataFrame`. This specification can be of the following forms
5151
`function` returns a single value or a vector; the generated name is created by
5252
concatenating source column name and `function` name by default (see examples below).
5353
3. a `cols => function => target_cols` form additionally explicitly specifying
54-
the target column or columns.
54+
the target column or columns, which must be a single name (as a `Symbol` or a string),
55+
a vector of names or `AsTable`. Additionally it can be a `Function` which
56+
takes a string or a vector of strings as an argument containing names of columns
57+
selected by `cols`, and returns the target column names (all accepted types
58+
except `AsTable` are allowed).
5559
4. a `col => target_cols` pair, which renames the column `col` to `target_cols`, which
5660
must be a single name (as a `Symbol` or a string), a vector of names or `AsTable`.
5761
5. a `nrow` or `nrow => target_cols` form which efficiently computes the number of rows

src/abstractdataframe/selection.jl

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,11 @@ const TRANSFORMATION_COMMON_RULES =
4545
`function` returns a single value or a vector; the generated name is created by
4646
concatenating source column name and `function` name by default (see examples below).
4747
3. a `cols => function => target_cols` form additionally explicitly specifying
48-
the target column or columns.
48+
the target column or columns, which must be a single name (as a `Symbol` or a string),
49+
a vector of names or `AsTable`. Additionally it can be a `Function` which
50+
takes a string or a vector of strings as an argument containing names of columns
51+
selected by `cols`, and returns the target column names (all accepted types
52+
except `AsTable` are allowed).
4953
4. a `col => target_cols` pair, which renames the column `col` to `target_cols`, which
5054
must be a single name (as a `Symbol` or a string), a vector of names or `AsTable`.
5155
5. a `nrow` or `nrow => target_cols` form which efficiently computes the number of rows
@@ -244,7 +248,8 @@ function normalize_selection(idx::AbstractIndex,
244248
<:Pair{<:Base.Callable,
245249
<:Union{Symbol, AbstractString, DataType,
246250
AbstractVector{Symbol},
247-
AbstractVector{<:AbstractString}}}}),
251+
AbstractVector{<:AbstractString},
252+
Function}}}),
248253
renamecols::Bool)
249254
lls = last(last(sel))
250255

@@ -276,6 +281,20 @@ function normalize_selection(idx::AbstractIndex,
276281
end
277282
end
278283

284+
if lls isa Function
285+
fun_colnames = _names(idx)[c]
286+
# if AsTable was used as source we always treat it as multicolumn selector
287+
if wanttable && fun_colnames isa Symbol
288+
fun_colnames = [fun_colnames]
289+
end
290+
lls = lls(string.(fun_colnames))
291+
if !(lls isa Union{Symbol, AbstractString, AbstractVector{Symbol},
292+
AbstractVector{<:AbstractString}})
293+
throw(ArgumentError("function producing target column names must " *
294+
"return a Symbol, a string, a vector of Symbols " *
295+
"or a vector of strings"))
296+
end
297+
end
279298
if lls isa AbstractString
280299
combine_target_col = Symbol(lls)
281300
elseif lls isa AbstractVector{<:AbstractString}
@@ -798,6 +817,15 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
798817
2 │ 3.5
799818
3 │ 4.5
800819
820+
julia> select(df, AsTable(:) => ByRow(mean) => x -> join(x, "_"))
821+
3×1 DataFrame
822+
Row │ a_b
823+
│ Float64
824+
─────┼─────────
825+
1 │ 2.5
826+
2 │ 3.5
827+
3 │ 4.5
828+
801829
julia> select(first, df)
802830
3×2 DataFrame
803831
Row │ a b
@@ -1125,6 +1153,15 @@ julia> combine(df, AsTable(:) => ByRow(mean), renamecols=false)
11251153
2 │ 3.5
11261154
3 │ 4.5
11271155
1156+
julia> combine(df, AsTable(:) => ByRow(mean) => x -> join(x, "_"))
1157+
3×1 DataFrame
1158+
Row │ a_b
1159+
│ Float64
1160+
─────┼─────────
1161+
1 │ 2.5
1162+
2 │ 3.5
1163+
3 │ 4.5
1164+
11281165
julia> combine(first, df)
11291166
1×2 DataFrame
11301167
Row │ a b

test/select.jl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1692,4 +1692,34 @@ end
16921692
@test_throws ArgumentError select(sdf, [:x => length => :a, 1 => :b], copycols=false)
16931693
end
16941694

1695+
@testset "function as target column names specifier" begin
1696+
df_ref = DataFrame(x=[[1, 2], [3, 4]], id=1:2)
1697+
for v in (df_ref, groupby(df_ref, :id))
1698+
@test select(v, :id, :x => ByRow(first) => identity) == DataFrame(id=1:2, x=[1, 3])
1699+
@test select(v, :id, "x" => ByRow(first) => identity) == DataFrame(id=1:2, x=[1, 3])
1700+
@test select(v, :id, 1 => ByRow(first) => identity) == DataFrame(id=1:2, x=[1, 3])
1701+
@test select(v, :id, 1 => ByRow(first) => uppercase) == DataFrame(id=1:2, X=[1, 3])
1702+
@test select(v, :id, 1 => ByRow(first) => string) == DataFrame(id=1:2, x=[1, 3])
1703+
@test select(v, :id, 1 => ByRow(first) => x -> Symbol(x)) == DataFrame(id=1:2, x=[1, 3])
1704+
@test select(v, :id, 1 => identity => x -> ["p", "q"]) ==
1705+
DataFrame(id=1:2, p=[1, 3], q=[2, 4])
1706+
@test select(v, :id, 1 => identity => x -> [:p, :q]) ==
1707+
DataFrame(id=1:2, p=[1, 3], q=[2, 4])
1708+
@test_throws ArgumentError select(v, :id, 1 => identity => x -> [:p, "q"])
1709+
@test_throws ArgumentError select(v, :id, 1 => identity => x -> AsTable)
1710+
@test select(v, :id, AsTable(1) => first => string) ==
1711+
DataFrame("id" => 1:2, "[\"x\"]" => [[1, 2], [3, 4]])
1712+
@test select(v, :id, ["x", "x"] => ByRow((p,q) -> first(p)) => string) ==
1713+
DataFrame("id" => 1:2, "[\"x\", \"x\"]" => [1, 3])
1714+
@test select(v, :id, 1:2 => ((p, q) -> q) => x -> join(x, "_")) ==
1715+
DataFrame(id=1:2, x_id=1:2)
1716+
@test select(v, :id, AsTable(1:2) => last => x -> join(x, "_")) ==
1717+
DataFrame(id=1:2, x_id=1:2)
1718+
# we could make this work, but I skip it to keep the code simpler
1719+
# The problem is that Symbol and String are not Function
1720+
@test_throws ArgumentError select(v, :id, 1 => ByRow(first) => Symbol)
1721+
@test_throws ArgumentError select(v, :id, 1 => ByRow(first) => String)
1722+
end
1723+
end
1724+
16951725
end # module

0 commit comments

Comments
 (0)