Skip to content

Commit d8add19

Browse files
authored
feature: unstack receives kwarg fillvalue (#2828)
1 parent 1a6fd15 commit d8add19

File tree

3 files changed

+111
-12
lines changed

3 files changed

+111
-12
lines changed

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
(notably `PooledArray` and `CategoricalArray`) or when they contained only
2323
integers in a small range.
2424
([#2812](https://github.com/JuliaData/DataFrames.jl/pull/2812))
25+
* the `unstack` function now accepts a new keyword argument `fill`
26+
(`missing` by default) that is used to fill row/column combinations that
27+
do not occur in the data. This makes it possible to distinguish missing values in the
28+
value column from absent row/column combinations, and to easily fill
29+
non-existent combinations with zeros when counting.
30+
([#2828](https://github.com/JuliaData/DataFrames.jl/pull/2828))
2531

2632
* Allow adding new columns to a `SubDataFrame` created with `:` as column selector
2733
([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)).

src/abstractdataframe/reshape.jl

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,11 @@ end
198198

199199
"""
200200
unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity,
201-
allowmissing::Bool=false, allowduplicates::Bool=false)
201+
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
202202
unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity,
203-
allowmissing::Bool=false, allowduplicates::Bool=false)
203+
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
204204
unstack(df::AbstractDataFrame; renamecols::Function=identity,
205-
allowmissing::Bool=false, allowduplicates::Bool=false)
205+
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
206206
207207
Unstack data frame `df`, i.e. convert it from long to wide format.
208208
@@ -229,6 +229,10 @@ Row and column keys will be ordered in the order of their first appearance.
229229
- `allowduplicates`: if `false` (the default) then an error will be thrown
230230
if combination of `rowkeys` and `colkey` contains duplicate entries; if `true`
231231
then the last encountered `value` will be retained.
232+
- `fill`: missing row/column combinations are filled with this value. The default
233+
is `missing`. If the `value` column is a `CategoricalVector` and `fill`
234+
is not `missing`, then `fill` must be passed as a `CategoricalValue` for the
235+
unstacked value columns to remain `CategoricalVector`s.
232236
233237
# Examples
234238
@@ -331,36 +335,55 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x))
331335
4 │ 4 2.0 1.0 2.0
332336
5 │ 5 2.0 1.0 3.0
333337
6 │ 6 2.0 1.0 3.0
338+
339+
julia> df = DataFrame(id=["1", "1", "2"],
340+
variable=["Var1", "Var2", "Var1"],
341+
value=[1, 2, 3])
342+
3×3 DataFrame
343+
Row │ id variable value
344+
│ String String Int64
345+
─────┼─────────────────────────
346+
1 │ 1 Var1 1
347+
2 │ 1 Var2 2
348+
3 │ 2 Var1 3
349+
350+
julia> unstack(df, :variable, :value, fill=0)
351+
2×3 DataFrame
352+
Row │ id Var1 Var2
353+
│ String Int64 Int64
354+
─────┼──────────────────────
355+
1 │ 1 1 2
356+
2 │ 2 3 0
334357
```
335358
Note that there are some differences between the widened results above.
336359
"""
337360
function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
338361
value::ColumnIndex; renamecols::Function=identity,
339-
allowmissing::Bool=false, allowduplicates::Bool=false)
362+
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
340363
rowkey_ints = vcat(index(df)[rowkeys])
341364
@assert rowkey_ints isa AbstractVector{Int}
342365
length(rowkey_ints) == 0 && throw(ArgumentError("No key column found"))
343366
g_rowkey = groupby(df, rowkey_ints)
344367
g_colkey = groupby(df, colkey)
345368
valuecol = df[!, value]
346369
return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey,
347-
valuecol, g_rowkey, renamecols, allowmissing, allowduplicates)
370+
valuecol, g_rowkey, renamecols, allowmissing, allowduplicates, fill)
348371
end
349372

350373
function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex;
351374
renamecols::Function=identity,
352-
allowmissing::Bool=false, allowduplicates::Bool=false)
375+
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
353376
colkey_int = index(df)[colkey]
354377
value_int = index(df)[value]
355378
return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
356379
renamecols=renamecols, allowmissing=allowmissing,
357-
allowduplicates=allowduplicates)
380+
allowduplicates=allowduplicates, fill=fill)
358381
end
359382

360383
unstack(df::AbstractDataFrame; renamecols::Function=identity,
361-
allowmissing::Bool=false, allowduplicates::Bool=false) =
384+
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) =
362385
unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
363-
allowduplicates=allowduplicates)
386+
allowduplicates=allowduplicates, fill=fill)
364387

365388
# we take into account the fact that idx, starts and ends are computed lazily
366389
# so we rather directly reference the gdf.groups
@@ -388,7 +411,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
388411
colkey::Int, g_colkey::GroupedDataFrame,
389412
valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
390413
renamecols::Function,
391-
allowmissing::Bool, allowduplicates::Bool)
414+
allowmissing::Bool, allowduplicates::Bool, fill)
392415
rowref = g_rowkey.groups
393416
row_group_row_idxs = find_group_row(g_rowkey)
394417
Nrow = length(g_rowkey)
@@ -398,13 +421,15 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
398421
Ncol = length(g_colkey)
399422
col_group_row_idxs = find_group_row(g_colkey)
400423
colref_map = df[col_group_row_idxs, colkey]
401-
402424
if any(ismissing, colref_map) && !allowmissing
403425
throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " *
404426
"Pass `allowmissing=true` to skip missings."))
405427
end
428+
unstacked_val = [fill!(similar(valuecol,
429+
promote_type(eltype(valuecol), typeof(fill)),
430+
Nrow),
431+
fill) for _ in 1:Ncol]
406432

407-
unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
408433
mask_filled = falses(Nrow, Ncol)
409434

410435
@assert length(rowref) == length(colref) == length(valuecol)

test/reshape.jl

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,74 @@ end
654654
@test IndexStyle(DataFrames.StackedVector) == IndexLinear()
655655
end
656656

657+
@testset "unstack with fill" begin
658+
df = DataFrame(factory=["Fac1", "Fac1", "Fac2", "Fac2"],
659+
variable=["Var1", "Var2", "Var1", "Var2"],
660+
value=[1, 2, 3, 4])
661+
dfu1 = DataFrame(factory=["Fac1", "Fac2"],
662+
Var1=allowmissing([1, 3]),
663+
Var2=allowmissing([2, 4]))
664+
dfu = unstack(df, :variable, :value)
665+
@test dfu ≅ dfu1
666+
@test eltype(dfu.Var1) === Union{Missing, Int}
667+
@test eltype(dfu.Var2) === Union{Missing, Int}
668+
669+
for (sentinel, coleltype) in zip([1, 1., "1", nothing], [Int, Float64, Any, Union{Int, Nothing}])
670+
dfu = unstack(df, :variable, :value, fill=sentinel)
671+
@test dfu ≅ dfu1
672+
@test eltype(dfu.Var1) === coleltype
673+
@test eltype(dfu.Var2) === coleltype
674+
end
675+
676+
df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
677+
variable=["Var1", "Var2", "Var1"],
678+
value=[1, 2, 3])
679+
for (sentinel, coleltype) in zip([1, 1.0, "1", nothing], [Int, Float64, Any, Union{Int, Nothing}])
680+
dfu = unstack(df, :variable, :value, fill=sentinel)
681+
@test dfu.Var1 == [1, 3]
682+
@test eltype(dfu.Var1) === coleltype
683+
@test dfu.Var2 == [2, sentinel]
684+
@test eltype(dfu.Var2) === coleltype
685+
end
686+
687+
df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
688+
variable=["Var1", "Var2", "Var1"],
689+
value=categorical([1, 2, 3], ordered=true))
690+
# categorical is dropped here
691+
for (sentinel, coleltype) in zip([0, 0.0, "", nothing], [Int, Float64, Any, Union{Int, Nothing}])
692+
dfu = unstack(df, :variable, :value, fill=sentinel)
693+
@test dfu.Var1 == [1, 3]
694+
@test typeof(dfu.Var1) === Vector{coleltype}
695+
@test dfu.Var2 == [2, sentinel]
696+
@test typeof(dfu.Var2) === Vector{coleltype}
697+
end
698+
# categorical is kept here
699+
for (sentinel, coleltype) in zip([missing, CategoricalValue(1, df.value), ], [Union{Int, Missing}, Int])
700+
dfu = unstack(df, :variable, :value, fill=sentinel)
701+
@test dfu.Var1 == [1, 3]
702+
@test typeof(dfu.Var1) <: CategoricalVector{coleltype}
703+
@test dfu.Var2 ≅ [2, sentinel]
704+
@test typeof(dfu.Var2) <: CategoricalVector{coleltype}
705+
@test levels(dfu.Var1) == levels(dfu.Var2) == levels(df.value)
706+
end
707+
708+
df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
709+
variable=["Var1", "Var2", "Var1"],
710+
value=categorical([1, 2, 3]))
711+
dfu = unstack(df, :variable, :value, fill=CategoricalValue(0, categorical([0])))
712+
@test dfu.Var1 == [1, 3]
713+
@test typeof(dfu.Var1) <: CategoricalVector{Int}
714+
@test dfu.Var2 ≅ [2, 0]
715+
@test typeof(dfu.Var2) <: CategoricalVector{Int}
716+
@test levels(dfu.Var1) == levels(dfu.Var2) == 0:3
717+
dfu = unstack(df, :variable, :value, fill=CategoricalValue("0", categorical(["0"])))
718+
@test dfu.Var1 == [1, 3]
719+
@test typeof(dfu.Var1) <: CategoricalVector{Union{Int,String}}
720+
@test dfu.Var2 ≅ [2, "0"]
721+
@test typeof(dfu.Var2) <: CategoricalVector{Union{Int,String}}
722+
@test levels(dfu.Var1) == levels(dfu.Var2) == ["0"; 1:3]
723+
end
724+
657725
@testset "empty unstack" begin
658726
df = DataFrame(a = [], b = [], c = [])
659727
dfu = unstack(df, :b, :c)

0 commit comments

Comments
 (0)