@@ -198,11 +198,11 @@ end
198198
199199"""
200200 unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity,
201- allowmissing::Bool=false, allowduplicates::Bool=false)
201+ allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing )
202202 unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity,
203- allowmissing::Bool=false, allowduplicates::Bool=false)
203+ allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing )
204204 unstack(df::AbstractDataFrame; renamecols::Function=identity,
205- allowmissing::Bool=false, allowduplicates::Bool=false)
205+ allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing )
206206
207207Unstack data frame `df`, i.e. convert it from long to wide format.
208208
@@ -229,6 +229,10 @@ Row and column keys will be ordered in the order of their first appearance.
229229- `allowduplicates`: if `false` (the default) then an error will be thrown
230230 if combination of `rowkeys` and `colkey` contains duplicate entries; if `true`
231231 then the last encountered `value` will be retained.
232+ - `fill`: missing row/column combinations are filled with this value. The default
233+ is `missing`. If the `value` column is a `CategoricalVector` and `fill`
234+ is not `missing`, then `fill` must be passed as a `CategoricalValue` for the
235+ unstacked value columns to remain `CategoricalVector`s.
232236
233237# Examples
234238
@@ -331,36 +335,55 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x))
331335 4 │ 4 2.0 1.0 2.0
332336 5 │ 5 2.0 1.0 3.0
333337 6 │ 6 2.0 1.0 3.0
338+
339+ julia> df = DataFrame(id=["1", "1", "2"],
340+ variable=["Var1", "Var2", "Var1"],
341+ value=[1, 2, 3])
342+ 3×3 DataFrame
343+ Row │ id variable value
344+ │ String String Int64
345+ ─────┼─────────────────────────
346+ 1 │ 1 Var1 1
347+ 2 │ 1 Var2 2
348+ 3 │ 2 Var1 3
349+
350+ julia> unstack(df, :variable, :value, fill=0)
351+ 2×3 DataFrame
352+ Row │ id Var1 Var2
353+ │ String Int64 Int64
354+ ─────┼──────────────────────
355+ 1 │ 1 1 2
356+ 2 │ 2 3 0
334357```
335358Note that there are some differences between the widened results above.
336359"""
337360function unstack (df:: AbstractDataFrame , rowkeys, colkey:: ColumnIndex ,
338361 value:: ColumnIndex ; renamecols:: Function = identity,
339- allowmissing:: Bool = false , allowduplicates:: Bool = false )
362+ allowmissing:: Bool = false , allowduplicates:: Bool = false , fill = missing )
# Method taking explicit row keys. In this diff view the "-" line is the
# pre-change signature and the "+" line the post-change one; the change adds
# the `fill` keyword, defaulting to `missing`.
# Resolve `rowkeys` through the data frame's index; the `vcat` presumably
# normalizes a single-column selection into a vector (TODO confirm), which the
# `@assert` below then checks as an internal invariant.
340363 rowkey_ints = vcat (index (df)[rowkeys])
341364 @assert rowkey_ints isa AbstractVector{Int}
# An empty row-key selection is rejected with an ArgumentError.
342365 length (rowkey_ints) == 0 && throw (ArgumentError (" No key column found" ))
# Group once by the row keys and once by the column key; both groupings,
# together with the value column, are handed to `_unstack`.
343366 g_rowkey = groupby (df, rowkey_ints)
344367 g_colkey = groupby (df, colkey)
345368 valuecol = df[! , value]
# `colkey` is resolved to its integer index for `_unstack`; in the post-change
# ("+") call `fill` is forwarded as the last positional argument.
346369 return _unstack (df, rowkey_ints, index (df)[colkey], g_colkey,
347- valuecol, g_rowkey, renamecols, allowmissing, allowduplicates)
370+ valuecol, g_rowkey, renamecols, allowmissing, allowduplicates, fill )
348371end
349372
# Method without explicit row keys: every column other than `colkey` and
# `value` is passed on as the row keys.
350373function unstack (df:: AbstractDataFrame , colkey:: ColumnIndex , value:: ColumnIndex ;
351374 renamecols:: Function = identity,
352- allowmissing:: Bool = false , allowduplicates:: Bool = false )
375+ allowmissing:: Bool = false , allowduplicates:: Bool = false , fill = missing )
# Resolve both selectors to integer indices so they can be excluded via `Not`
# and passed on unambiguously.
353376 colkey_int = index (df)[colkey]
354377 value_int = index (df)[value]
# Delegate to the method that takes explicit row keys, forwarding all keyword
# arguments; the post-change ("+") line additionally forwards `fill`.
355378 return unstack (df, Not (colkey_int, value_int), colkey_int, value_int,
356379 renamecols= renamecols, allowmissing= allowmissing,
357- allowduplicates= allowduplicates)
380+ allowduplicates= allowduplicates, fill = fill )
358381end
359382
# Keyword-only convenience method: uses `:variable` as the column key and
# `:value` as the value column, forwarding all keyword arguments (including
# `fill` in the post-change "+" lines) to the `colkey`/`value` method.
360383unstack (df:: AbstractDataFrame ; renamecols:: Function = identity,
361- allowmissing:: Bool = false , allowduplicates:: Bool = false ) =
384+ allowmissing:: Bool = false , allowduplicates:: Bool = false , fill = missing ) =
362385 unstack (df, :variable , :value , renamecols= renamecols, allowmissing= allowmissing,
363- allowduplicates= allowduplicates)
386+ allowduplicates= allowduplicates, fill = fill )
364387
365388# We take into account the fact that idx, starts and ends are computed lazily,
366389# so we reference gdf.groups directly instead.
@@ -388,7 +411,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
388411 colkey:: Int , g_colkey:: GroupedDataFrame ,
389412 valuecol:: AbstractVector , g_rowkey:: GroupedDataFrame ,
390413 renamecols:: Function ,
391- allowmissing:: Bool , allowduplicates:: Bool )
414+ allowmissing:: Bool , allowduplicates:: Bool , fill )
392415 rowref = g_rowkey. groups
393416 row_group_row_idxs = find_group_row (g_rowkey)
394417 Nrow = length (g_rowkey)
@@ -398,13 +421,15 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
398421 Ncol = length (g_colkey)
399422 col_group_row_idxs = find_group_row (g_colkey)
400423 colref_map = df[col_group_row_idxs, colkey]
401-
402424 if any (ismissing, colref_map) && ! allowmissing
403425 throw (ArgumentError (" Missing value in variable :$(_names (df)[colkey]) . " *
404426 " Pass `allowmissing=true` to skip missings." ))
405427 end
428+ unstacked_val = [fill! (similar (valuecol,
429+ promote_type (eltype (valuecol), typeof (fill)),
430+ Nrow),
431+ fill) for _ in 1 : Ncol]
406432
407- unstacked_val = [similar_missing (valuecol, Nrow) for i in 1 : Ncol]
408433 mask_filled = falses (Nrow, Ncol)
409434
410435 @assert length (rowref) == length (colref) == length (valuecol)
0 commit comments