feature: unstack receives kwarg fill (#2828)

pstorozenko · web-flow · commit d8add19769a2 · 2021-09-08T08:13:06.000+02:00
diff --git a/NEWS.md b/NEWS.md
@@ -22,6 +22,12 @@
   (notably `PooledArray` and `CategoricalArray`) or when they contained only
   integers in a small range.
   ([#2812](https://github.com/JuliaData/DataFrames.jl/pull/2812))
+* the `unstack` function receives a new keyword argument `fill`
+  (`missing` by default) that is used to fill row/column combinations
+  that were not encountered. This makes it possible to distinguish missings
+  in the value column from missing row/column combinations, and to easily
+  fill non-existing combinations with zeros when counting.
+  ([#2828](https://github.com/JuliaData/DataFrames.jl/pull/2828))
 
 * Allow adding new columns to a `SubDataFrame` created with `:` as column selector
   ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)).
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -198,11 +198,11 @@ end
 
 """
     unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity,
-            allowmissing::Bool=false, allowduplicates::Bool=false)
+            allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
     unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity,
-            allowmissing::Bool=false, allowduplicates::Bool=false)
+            allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
     unstack(df::AbstractDataFrame; renamecols::Function=identity,
-            allowmissing::Bool=false, allowduplicates::Bool=false)
+            allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
 
 Unstack data frame `df`, i.e. convert it from long to wide format.
 
@@ -229,6 +229,10 @@ Row and column keys will be ordered in the order of their first appearance.
 - `allowduplicates`: if `false` (the default) then an error an error will be thrown
   if combination of `rowkeys` and `colkey` contains duplicate entries; if `true`
   then  then the last encountered `value` will be retained.
+- `fill`: missing row/column combinations are filled with this value. The default
+  is `missing`. If the `value` column is a `CategoricalVector` and `fill`
+  is not `missing`, then `fill` must be passed as a `CategoricalValue` for
+  the unstacked value columns to remain `CategoricalVector`s.
 
 # Examples
 
@@ -331,36 +335,55 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x))
    4 │     4       2.0       1.0       2.0
    5 │     5       2.0       1.0       3.0
    6 │     6       2.0       1.0       3.0
+
+julia> df = DataFrame(id=["1", "1", "2"],
+                      variable=["Var1", "Var2", "Var1"],
+                      value=[1, 2, 3])
+3×3 DataFrame
+ Row │ id      variable  value
+     │ String  String    Int64
+─────┼─────────────────────────
+   1 │ 1       Var1          1
+   2 │ 1       Var2          2
+   3 │ 2       Var1          3
+
+julia> unstack(df, :variable, :value, fill=0)
+2×3 DataFrame
+ Row │ id      Var1   Var2
+     │ String  Int64  Int64
+─────┼──────────────────────
+   1 │ 1           1      2
+   2 │ 2           3      0
 ```
 Note that there are some differences between the widened results above.
 """
 function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
                  value::ColumnIndex; renamecols::Function=identity,
-                 allowmissing::Bool=false, allowduplicates::Bool=false)
+                 allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
     rowkey_ints = vcat(index(df)[rowkeys])
     @assert rowkey_ints isa AbstractVector{Int}
     length(rowkey_ints) == 0 && throw(ArgumentError("No key column found"))
     g_rowkey = groupby(df, rowkey_ints)
     g_colkey = groupby(df, colkey)
     valuecol = df[!, value]
     return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey,
-                    valuecol, g_rowkey, renamecols, allowmissing, allowduplicates)
+                    valuecol, g_rowkey, renamecols, allowmissing, allowduplicates, fill)
 end
 
 function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex;
                  renamecols::Function=identity,
-                 allowmissing::Bool=false, allowduplicates::Bool=false)
+                 allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
     colkey_int = index(df)[colkey]
     value_int = index(df)[value]
     return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
             renamecols=renamecols, allowmissing=allowmissing,
-            allowduplicates=allowduplicates)
+            allowduplicates=allowduplicates, fill=fill)
 end
 
 unstack(df::AbstractDataFrame; renamecols::Function=identity,
-        allowmissing::Bool=false, allowduplicates::Bool=false) =
+        allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) =
     unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
-            allowduplicates=allowduplicates)
+            allowduplicates=allowduplicates, fill=fill)
 
 # we take into account the fact that idx, starts and ends are computed lazily
 # so we rather directly reference the gdf.groups
@@ -388,7 +411,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                   colkey::Int, g_colkey::GroupedDataFrame,
                   valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
                   renamecols::Function,
-                  allowmissing::Bool, allowduplicates::Bool)
+                  allowmissing::Bool, allowduplicates::Bool, fill)
     rowref = g_rowkey.groups
     row_group_row_idxs = find_group_row(g_rowkey)
     Nrow = length(g_rowkey)
@@ -398,13 +421,15 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
     Ncol = length(g_colkey)
     col_group_row_idxs = find_group_row(g_colkey)
     colref_map = df[col_group_row_idxs, colkey]
-
     if any(ismissing, colref_map) && !allowmissing
         throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " *
                             "Pass `allowmissing=true` to skip missings."))
     end
+    unstacked_val = [fill!(similar(valuecol,
+                                   promote_type(eltype(valuecol), typeof(fill)),
+                                   Nrow),
+                           fill) for _ in 1:Ncol]
 
-    unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
     mask_filled = falses(Nrow, Ncol)
 
     @assert length(rowref) == length(colref) == length(valuecol)
diff --git a/test/reshape.jl b/test/reshape.jl
@@ -654,6 +654,74 @@ end
     @test IndexStyle(DataFrames.StackedVector) == IndexLinear()
 end
 
+@testset "unstack with fill" begin
+    df = DataFrame(factory=["Fac1", "Fac1", "Fac2", "Fac2"],
+                   variable=["Var1", "Var2", "Var1", "Var2"],
+                   value=[1, 2, 3, 4])
+    dfu1 = DataFrame(factory=["Fac1", "Fac2"],
+                     Var1=allowmissing([1, 3]),
+                     Var2=allowmissing([2, 4]))
+    dfu = unstack(df, :variable, :value)
+    @test dfu ≅ dfu1
+    @test eltype(dfu.Var1) === Union{Missing, Int}
+    @test eltype(dfu.Var2) === Union{Missing, Int}
+
+    for (sentinel, coleltype) in zip([1, 1., "1", nothing], [Int, Float64, Any, Union{Int, Nothing}])
+        dfu = unstack(df, :variable, :value, fill=sentinel)
+        @test dfu ≅ dfu1
+        @test eltype(dfu.Var1) === coleltype
+        @test eltype(dfu.Var2) === coleltype
+    end
+
+    df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
+                   variable=["Var1", "Var2", "Var1"],
+                   value=[1, 2, 3])
+    for (sentinel, coleltype) in zip([1, 1.0, "1", nothing], [Int, Float64, Any, Union{Int, Nothing}])
+        dfu = unstack(df, :variable, :value, fill=sentinel)
+        @test dfu.Var1 == [1, 3]
+        @test eltype(dfu.Var1) === coleltype
+        @test dfu.Var2 == [2, sentinel]
+        @test eltype(dfu.Var2) === coleltype
+    end
+
+    df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
+                   variable=["Var1", "Var2", "Var1"],
+                   value=categorical([1, 2, 3], ordered=true))
+    # `fill` is not a `CategoricalValue`, so the result is a plain `Vector`
+    for (sentinel, coleltype) in zip([0, 0.0, "", nothing], [Int, Float64, Any, Union{Int, Nothing}])
+        dfu = unstack(df, :variable, :value, fill=sentinel)
+        @test dfu.Var1 == [1, 3]
+        @test typeof(dfu.Var1) === Vector{coleltype}
+        @test dfu.Var2 == [2, sentinel]
+        @test typeof(dfu.Var2) === Vector{coleltype}
+    end
+    # `fill` is `missing` or a `CategoricalValue`, so the result stays categorical
+    for (sentinel, coleltype) in zip([missing, CategoricalValue(1, df.value), ], [Union{Int, Missing}, Int])
+        dfu = unstack(df, :variable, :value, fill=sentinel)
+        @test dfu.Var1 == [1, 3]
+        @test typeof(dfu.Var1) <: CategoricalVector{coleltype}
+        @test dfu.Var2 ≅ [2, sentinel]
+        @test typeof(dfu.Var2) <: CategoricalVector{coleltype}
+        @test levels(dfu.Var1) == levels(dfu.Var2) == levels(df.value)
+    end
+
+    df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
+                   variable=["Var1", "Var2", "Var1"],
+                   value=categorical([1, 2, 3]))
+    dfu = unstack(df, :variable, :value, fill=CategoricalValue(0, categorical([0])))
+    @test dfu.Var1 == [1, 3]
+    @test typeof(dfu.Var1) <: CategoricalVector{Int}
+    @test dfu.Var2 ≅ [2, 0]
+    @test typeof(dfu.Var2) <: CategoricalVector{Int}
+    @test levels(dfu.Var1) == levels(dfu.Var2) == 0:3
+    dfu = unstack(df, :variable, :value, fill=CategoricalValue("0", categorical(["0"])))
+    @test dfu.Var1 == [1, 3]
+    @test typeof(dfu.Var1) <: CategoricalVector{Union{Int,String}}
+    @test dfu.Var2 ≅ [2, "0"]
+    @test typeof(dfu.Var2) <: CategoricalVector{Union{Int,String}}
+    @test levels(dfu.Var1) == levels(dfu.Var2) == ["0"; 1:3]
+end
+
 @testset "empty unstack" begin
     df = DataFrame(a = [], b = [], c = [])
     dfu = unstack(df, :b, :c)