rename valuestransform to combine in unstack (#3185)

bkamins · web-flow · commit 57b560b71f0b · 2022-10-03T08:25:30.000+02:00
diff --git a/NEWS.md b/NEWS.md
@@ -18,9 +18,10 @@
   for a more flexible handling of values stored in a column that will
   become a new header
   ([#3004](https://github.com/JuliaData/DataFrames.jl/issues/3004))
-* `unstack` now allows passing a function in `valuestransform` keyword argument;
+* `unstack` now allows passing a function in `combine` keyword argument;
   this allows for a convenient creation of two dimensional pivot tables
-  ([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998))
+  ([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998),
+   [#3185](https://github.com/JuliaData/DataFrames.jl/pull/3185))
 * `filter` for `GroupedDataFrame` now accepts `ungroup` keyword argument
   ([#3021](https://github.com/JuliaData/DataFrames.jl/issues/3021))
 * Add special syntax for `eachindex`, `groupindices`, and `proprow`
@@ -65,6 +66,12 @@
   or older it is an in place operation.
   ([#3022](https://github.com/JuliaData/DataFrames.jl/pull/3022))
 
+## Deprecations
+
+* `allowduplicates` keyword argument in `unstack` is deprecated;
+  the `combine` keyword argument should be used instead
+  ([#3185](https://github.com/JuliaData/DataFrames.jl/pull/3185))
+
 ## Internal changes
 
 * `DataFrame` is now a `mutable struct` and has three new fields
diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md
@@ -297,7 +297,7 @@ Id columns -- `RepeatedVector`
 This repeats the original columns N times where N is the number of columns stacked.
 
 To do aggregation, use the split-apply-combine functions in combination with
-`unstack` or use the `valuestransform` keyword argument in `unstack`. Here is an example:
+`unstack` or use the `combine` keyword argument in `unstack`. Here is an example:
 
 ```jldoctest reshape
 julia> using Statistics
@@ -357,7 +357,7 @@ julia> unstack(agg, :variable, :Species, :vmean)
    4 │ PetalWidth         0.244            1.326           2.026
    5 │ id                25.5             75.5           125.5
 
-julia> unstack(d, :variable, :Species, :value, valuestransform=mean)
+julia> unstack(d, :variable, :Species, :value, combine=mean)
 5×4 DataFrame
  Row │ variable     Iris-setosa  Iris-versicolor  Iris-virginica
      │ String       Float64?     Float64?         Float64?
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -215,20 +215,17 @@ end
 """
     unstack(df::AbstractDataFrame, rowkeys, colkey, value;
             renamecols::Function=identity, allowmissing::Bool=false,
-            allowduplicates::Bool=false, valuestransform=nothing,
-            fill=missing, threads::Bool=true)
+            combine=only, fill=missing, threads::Bool=true)
     unstack(df::AbstractDataFrame, colkey, value;
             renamecols::Function=identity, allowmissing::Bool=false,
-            allowduplicates::Bool=false, valuestransform=nothing,
-            fill=missing, threads::Bool=true)
+            combine=only, fill=missing, threads::Bool=true)
     unstack(df::AbstractDataFrame;
             renamecols::Function=identity, allowmissing::Bool=false,
-            allowduplicates::Bool=false, valuestransform=nothing,
-            fill=missing, threads::Bool=true)
+            combine=only, fill=missing, threads::Bool=true)
 
 Unstack data frame `df`, i.e. convert it from long to wide format.
 
-Row and column keys will be ordered in the order of their first appearance.
+Row and column keys are ordered in the order of their first appearance.
 
 # Positional arguments
 - `df` : the AbstractDataFrame to be unstacked
@@ -246,27 +243,31 @@ Row and column keys will be ordered in the order of their first appearance.
   return the name of the column to be created (typically as a string or a
   `Symbol`). Duplicates in resulting names when converted to `Symbol` are not
   allowed. By default no transformation is performed.
-- `allowmissing`: if `false` (the default) then an error will be thrown if
+- `allowmissing`: if `false` (the default) then an error is thrown if
   `colkey` contains `missing` values; if `true` then a column referring to
-  `missing` value will be created.
-- `allowduplicates`: if `false` (the default) then an error an error will be
-  thrown if combination of `rowkeys` and `colkey` contains duplicate entries; if
-  `true` then the last encountered `value` will be retained;
-  this keyword argument is ignored if `valuestransform` keyword argument is passed.
-- `valuestransform`: if passed then `allowduplicates` is ignored and instead
-   the passed function will be called on a vector view containing all elements
-   for each combination of `rowkeys` and `colkey` present in the data.
+  `missing` value is created.
+- `combine`: if `only` (the default) then an error is thrown if combination
+  of `rowkeys` and `colkey` contains duplicate entries. Otherwise the passed
+  value must be a function that is called on a vector view containing all
+  elements for each combination of `rowkeys` and `colkey` present in the data.
 - `fill`: missing row/column combinations are filled with this value. The
   default is `missing`. If the `value` column is a `CategoricalVector` and
   `fill` is not `missing` then in order to keep unstacked value columns also
   `CategoricalVector` the `fill` must be passed as `CategoricalValue`
-- `threads`: whether `valuestransform` may be run in separate tasks which
-  can execute in parallel (possibly being applied to multiple groups at the same time).
-  Whether or not tasks are actually spawned and their number are determined automatically.
-  Set to `false` if `valuestransform` requires serial execution or is not thread-safe.
+- `threads`: whether `combine` function may be run in separate tasks which can
+  execute in parallel (possibly being applied to multiple groups at the same
+  time). Whether or not tasks are actually spawned and their number are
+  determined automatically. Set to `false` if `combine` requires serial
+  execution or is not thread-safe.
 
-Metadata: table-level `:note`-style metadata and column-level `:note`-style metadata
-for row keys columns are preserved.
+Metadata: table-level `:note`-style metadata and column-level `:note`-style
+metadata for row keys columns are preserved.
+
+# Deprecations
+
+- `allowduplicates` keyword argument is deprecated; use the `combine`
+  keyword argument instead: `allowduplicates=true` is equivalent to
+  `combine=last`, and `allowduplicates=false` to `combine=only` (the default).
 
 # Examples
 
@@ -401,14 +402,14 @@ julia> df = DataFrame(cols=["a", "a", "b"], values=[1, 2, 4])
    2 │ a            2
    3 │ b            4
 
-julia> unstack(df, :cols, :values, valuestransform=copy)
+julia> unstack(df, :cols, :values, combine=copy)
 1×2 DataFrame
  Row │ a        b
      │ Array…?  Array…?
 ─────┼──────────────────
    1 │ [1, 2]   [4]
 
-julia> unstack(df, :cols, :values, valuestransform=sum)
+julia> unstack(df, :cols, :values, combine=sum)
 1×2 DataFrame
  Row │ a       b
      │ Int64?  Int64?
@@ -418,17 +419,21 @@ julia> unstack(df, :cols, :values, valuestransform=sum)
 """
 function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
                  values::ColumnIndex; renamecols::Function=identity,
-                 allowmissing::Bool=false, allowduplicates::Bool=false,
-                 valuestransform=nothing, fill=missing,
-                 threads::Bool=true)
+                 allowmissing::Bool=false,  allowduplicates::Bool=false,
+                 combine=only, fill=missing, threads::Bool=true)
+    if allowduplicates
+        Base.depwarn("allowduplicates keyword argument is deprecated. " *
+                     "Pass `combine=last` instead of `allowduplicates=true`.", :unstack)
+        combine = last
+    end
     # first make sure that rowkeys are unique and
     # normalize all selectors as a strings
     # if some of the selectors are wrong we will get an early error here
     rowkeys = names(df, index(df)[rowkeys])
     colkey = only(names(df, colkey))
     values = only(names(df, values))
 
-    if !isnothing(valuestransform)
+    if combine !== only
         # potentially colkey can be also part of rowkeys so we need to do unique
         groupcols = unique!([rowkeys; colkey])
         @assert groupcols isa Vector{String}
@@ -441,60 +446,67 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
         end
 
         gdf = groupby(df, groupcols)
-        if check_aggregate(valuestransform, df[!, values]) isa AbstractAggregate
-            # if valuestransform function is AbstractAggregate
+        if check_aggregate(combine, df[!, values]) isa AbstractAggregate
+            # if combine function is AbstractAggregate
             # then we are sure it will return a scalar number so we can
             # leave it as is and be sure we use fast path in combine
-            agg_fun = valuestransform
+            agg_fun = combine
         else
-            # in general valuestransform function could return e.g. a vector,
+            # in general combine function could return e.g. a vector,
             # which would get expanded to multiple rows so we protect it with
             # Ref that will get unwrapped by combine
-            agg_fun = Ref∘valuestransform
+            agg_fun = Ref∘combine
         end
-        df_op = combine(gdf, values => agg_fun => values_out,
-                        threads=threads)
+        df_op = DataFrames.combine(gdf, values => agg_fun => values_out,
+                                   threads=threads)
 
         group_rows = find_group_row(gdf)
         if !issorted(group_rows)
             df_op = df_op[sortperm(group_rows), :]
         end
-        # set allowduplicates to true as we should not have any duplicates now
-        # and allowduplicates=true is a bit faster
-        allowduplicates = true
+        # we should not have any duplicates in df_op now
+        noduplicates = true
     else
         df_op = df
         values_out = values
+        noduplicates = false
     end
 
     g_rowkey = groupby(df_op, rowkeys)
     g_colkey = groupby(df_op, colkey)
     valuecol = df_op[!, values_out]
     return _unstack(df_op, index(df_op)[rowkeys], index(df_op)[colkey], g_colkey,
-                    valuecol, g_rowkey, renamecols,
-                    allowmissing, allowduplicates, fill)
+                    valuecol, g_rowkey, renamecols, allowmissing, noduplicates, fill)
 end
 
 function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex;
-                 renamecols::Function=identity,
-                 allowmissing::Bool=false, allowduplicates::Bool=false,
-                 valuestransform=nothing, fill=missing,
-                 threads::Bool=true)
+                 renamecols::Function=identity, allowmissing::Bool=false,
+                  allowduplicates::Bool=false, combine=only, fill=missing,
+                  threads::Bool=true)
+    if allowduplicates
+        Base.depwarn("allowduplicates keyword argument is deprecated. " *
+                     "Pass `combine=last` instead of allowduplicates=true.", :unstack)
+        combine = last
+    end
     colkey_int = index(df)[colkey]
     value_int = index(df)[values]
     return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
             renamecols=renamecols, allowmissing=allowmissing,
-            allowduplicates=allowduplicates, valuestransform=valuestransform,
+            combine=combine,
             fill=fill, threads=threads)
 end
 
-unstack(df::AbstractDataFrame; renamecols::Function=identity,
-        allowmissing::Bool=false, allowduplicates::Bool=false,
-        valuestransform=nothing, fill=missing,
-        threads::Bool=true) =
+function unstack(df::AbstractDataFrame; renamecols::Function=identity,
+                 allowmissing::Bool=false, allowduplicates::Bool=false,
+                 combine=only, fill=missing, threads::Bool=true)
+    if allowduplicates
+        Base.depwarn("allowduplicates keyword argument is deprecated. " *
+                     "Pass `combine=last` instead of allowduplicates=true.", :unstack)
+        combine = last
+    end
     unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
-            allowduplicates=allowduplicates, valuestransform=valuestransform,
-            fill=fill, threads=threads)
+            combine=combine, fill=fill, threads=threads)
+end
 
 # we take into account the fact that idx, starts and ends are computed lazily
 # so we rather directly reference the gdf.groups
@@ -521,8 +533,7 @@ end
 function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                   colkey::Int, g_colkey::GroupedDataFrame,
                   valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
-                  renamecols::Function, allowmissing::Bool,
-                  allowduplicates::Bool, fill)
+                  renamecols::Function, allowmissing::Bool, noduplicates::Bool, fill)
     rowref = g_rowkey.groups
     row_group_row_idxs = find_group_row(g_rowkey)
     Nrow = length(g_rowkey)
@@ -543,8 +554,8 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                            Nrow),
                      fill) for _ in 1:Ncol]
 
-    # use a separate path for allowduplicates to reduce memory use and increase speed
-    if allowduplicates
+    # use a separate path for noduplicates to reduce memory use and increase speed
+    if noduplicates
         for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol))
             unstacked_val[col_id][row_id] = val
         end
@@ -556,7 +567,8 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                 bad_var = colref_map[col_id]
                 throw(ArgumentError("Duplicate entries in unstack at row $k for key "*
                                     "$bad_key and variable $bad_var. " *
-                                    "Pass allowduplicates=true to allow them."))
+                                    "Pass `combine` keyword argument to specify " *
+                                    "how they should be handled."))
             end
             unstacked_val[col_id][row_id] = val
             mask_filled[row_id, col_id] = true
diff --git a/test/deprecated.jl b/test/deprecated.jl
@@ -140,4 +140,19 @@ end
     @test df == DataFrame(x=1, y=1:4)
 end
 
+@testset "deprecated allowduplicates in unstack" begin
+    df = DataFrame(row=[1, 1, 2, 2], variable=["x", "x", "y", "y"], value=1:4)
+    @test_throws ArgumentError unstack(df, :row, :variable, :value)
+    @test unstack(df, :row, :variable, :value, allowduplicates=true) ≅
+          DataFrame(row=1:2, x=[2, missing], y=[missing, 4])
+    @test unstack(df, :variable, :value, allowduplicates=true) ≅
+          DataFrame(row=1:2, x=[2, missing], y=[missing, 4])
+    @test unstack(df, allowduplicates=true) ≅
+          DataFrame(row=1:2, x=[2, missing], y=[missing, 4])
+    @test unstack(df, :variable, :value, allowduplicates=true) ≅
+          DataFrame(row=1:2, x=[2, missing], y=[missing, 4])
+    @test unstack(df, :row, :variable, :value, allowduplicates=true) ≅
+          unstack(df, :row, :variable, :value, combine=last)
+end
+
 end # module
diff --git a/test/metadata.jl b/test/metadata.jl
@@ -1327,7 +1327,7 @@ end
     @test check_allnotemetadata(res)
     @test getfield(res, :metadata) === nothing
     @test getfield(res, :colmetadata) === nothing
-    res = unstack(long, :a, :variable, :value, valuestransform=copy)
+    res = unstack(long, :a, :variable, :value, combine=copy)
     @test check_allnotemetadata(res)
     @test getfield(res, :metadata) === nothing
     @test getfield(res, :colmetadata) === nothing
@@ -1361,7 +1361,7 @@ end
     @test isempty(colmetadatakeys(res, :c))
     @test isempty(colmetadatakeys(res, :d))
 
-    res = unstack(long, :a, :variable, :value, valuestransform=copy)
+    res = unstack(long, :a, :variable, :value, combine=copy)
     @test check_allnotemetadata(res)
     @test collect(metadatakeys(res)) == ["name"]
     @test metadata(res, "name") == "empty"
diff --git a/test/multithreading.jl b/test/multithreading.jl
@@ -236,16 +236,13 @@ end
     l = Ref(0)
     m = Ref(0)
     n = Ref(0)
-    unstack(df,
-            allowduplicates=true, valuestransform=x -> (l[] += 1),
+    unstack(df, combine=x -> (l[] += 1),
             threads=false) ==
             DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) ==
-    unstack(df, :variable, :value,
-            allowduplicates=true, valuestransform=x -> (m[] += 1),
+    unstack(df, :variable, :value, combine=x -> (m[] += 1),
             threads=false) ==
             DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) ==
-    unstack(df, :id, :variable, :value,
-            allowduplicates=true, valuestransform=x -> (n[] += 1),
+    unstack(df, :id, :variable, :value, combine=x -> (n[] += 1),
             threads=false) ==
             DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6])
 
diff --git a/test/reshape.jl b/test/reshape.jl