@@ -215,20 +215,17 @@ end
215215"""
216216 unstack(df::AbstractDataFrame, rowkeys, colkey, value;
217217 renamecols::Function=identity, allowmissing::Bool=false,
218- allowduplicates::Bool=false, valuestransform=nothing,
219- fill=missing, threads::Bool=true)
218+ combine=only, fill=missing, threads::Bool=true)
220219 unstack(df::AbstractDataFrame, colkey, value;
221220 renamecols::Function=identity, allowmissing::Bool=false,
222- allowduplicates::Bool=false, valuestransform=nothing,
223- fill=missing, threads::Bool=true)
221+ combine=only, fill=missing, threads::Bool=true)
224222 unstack(df::AbstractDataFrame;
225223 renamecols::Function=identity, allowmissing::Bool=false,
226- allowduplicates::Bool=false, valuestransform=nothing,
227- fill=missing, threads::Bool=true)
224+ combine=only, fill=missing, threads::Bool=true)
228225
229226Unstack data frame `df`, i.e. convert it from long to wide format.
230227
231- Row and column keys will be ordered in the order of their first appearance.
228+ Row and column keys are ordered in the order of their first appearance.
232229
233230# Positional arguments
234231- `df` : the AbstractDataFrame to be unstacked
@@ -246,27 +243,31 @@ Row and column keys will be ordered in the order of their first appearance.
246243 return the name of the column to be created (typically as a string or a
247244 `Symbol`). Duplicates in resulting names when converted to `Symbol` are not
248245 allowed. By default no transformation is performed.
249- - `allowmissing`: if `false` (the default) then an error will be thrown if
246+ - `allowmissing`: if `false` (the default) then an error is thrown if
250247 `colkey` contains `missing` values; if `true` then a column referring to
251- `missing` value will be created.
252- - `allowduplicates`: if `false` (the default) then an error an error will be
253- thrown if combination of `rowkeys` and `colkey` contains duplicate entries; if
254- `true` then the last encountered `value` will be retained;
255- this keyword argument is ignored if `valuestransform` keyword argument is passed.
256- - `valuestransform`: if passed then `allowduplicates` is ignored and instead
257- the passed function will be called on a vector view containing all elements
258- for each combination of `rowkeys` and `colkey` present in the data.
248+ `missing` value is created.
249+ - `combine`: if `only` (the default) then an error is thrown if a combination
250+ of `rowkeys` and `colkey` contains duplicate entries. Otherwise the passed
251+ value must be a function that is called on a vector view containing all
252+ elements for each combination of `rowkeys` and `colkey` present in the data.
259253- `fill`: missing row/column combinations are filled with this value. The
260254 default is `missing`. If the `value` column is a `CategoricalVector` and
261255 `fill` is not `missing` then in order to keep unstacked value columns also
262256 `CategoricalVector` the `fill` must be passed as `CategoricalValue`
263- - `threads`: whether `valuestransform` may be run in separate tasks which
264- can execute in parallel (possibly being applied to multiple groups at the same time).
265- Whether or not tasks are actually spawned and their number are determined automatically.
266- Set to `false` if `valuestransform` requires serial execution or is not thread-safe.
257+ - `threads`: whether `combine` function may be run in separate tasks which can
258+ execute in parallel (possibly being applied to multiple groups at the same
259+ time). Whether or not tasks are actually spawned and their number are
260+ determined automatically. Set to `false` if `combine` requires serial
261+ execution or is not thread-safe.
267262
268- Metadata: table-level `:note`-style metadata and column-level `:note`-style metadata
269- for row keys columns are preserved.
263+ Metadata: table-level `:note`-style metadata and column-level `:note`-style
264+ metadata for row keys columns are preserved.
265+
266+ # Deprecations
267+
267+ - The `allowduplicates` keyword argument is deprecated; use the `combine`
268+ keyword argument instead. `allowduplicates=true` is equivalent to
269+ `combine=last`, and `allowduplicates=false` to `combine=only` (the default).
270271
271272# Examples
272273
@@ -401,14 +402,14 @@ julia> df = DataFrame(cols=["a", "a", "b"], values=[1, 2, 4])
401402 2 │ a 2
402403 3 │ b 4
403404
404- julia> unstack(df, :cols, :values, valuestransform =copy)
405+ julia> unstack(df, :cols, :values, combine =copy)
4054061×2 DataFrame
406407 Row │ a b
407408 │ Array…? Array…?
408409─────┼──────────────────
409410 1 │ [1, 2] [4]
410411
411- julia> unstack(df, :cols, :values, valuestransform =sum)
412+ julia> unstack(df, :cols, :values, combine =sum)
4124131×2 DataFrame
413414 Row │ a b
414415 │ Int64? Int64?
@@ -418,17 +419,21 @@ julia> unstack(df, :cols, :values, valuestransform=sum)
418419"""
419420function unstack (df:: AbstractDataFrame , rowkeys, colkey:: ColumnIndex ,
420421 values:: ColumnIndex ; renamecols:: Function = identity,
421- allowmissing:: Bool = false , allowduplicates:: Bool = false ,
422- valuestransform= nothing , fill= missing ,
423- threads:: Bool = true )
422+ allowmissing:: Bool = false , allowduplicates:: Bool = false ,
423+ combine= only, fill= missing , threads:: Bool = true )
424+ if allowduplicates
425+ Base. depwarn (" allowduplicates keyword argument is deprecated. " *
426+ " Pass `combine=last` instead of `allowduplicates=true`." , :unstack )
427+ combine = last
428+ end
424429 # first make sure that rowkeys are unique and
425430 # normalize all selectors as strings
426431 # if some of the selectors are wrong we will get an early error here
427432 rowkeys = names (df, index (df)[rowkeys])
428433 colkey = only (names (df, colkey))
429434 values = only (names (df, values))
430435
431- if ! isnothing (valuestransform)
436+ if combine != = only
432437 # potentially colkey can be also part of rowkeys so we need to do unique
433438 groupcols = unique! ([rowkeys; colkey])
434439 @assert groupcols isa Vector{String}
@@ -441,60 +446,67 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
441446 end
442447
443448 gdf = groupby (df, groupcols)
444- if check_aggregate (valuestransform , df[! , values]) isa AbstractAggregate
445- # if valuestransform function is AbstractAggregate
449+ if check_aggregate (combine , df[! , values]) isa AbstractAggregate
450+ # if combine function is AbstractAggregate
446451 # then we are sure it will return a scalar number so we can
447452 # leave it as is and be sure we use fast path in combine
448- agg_fun = valuestransform
453+ agg_fun = combine
449454 else
450- # in general valuestransform function could return e.g. a vector,
455+ # in general combine function could return e.g. a vector,
451456 # which would get expanded to multiple rows so we protect it with
452457 # Ref that will get unwrapped by combine
453- agg_fun = Ref∘ valuestransform
458+ agg_fun = Ref∘ combine
454459 end
455- df_op = combine (gdf, values => agg_fun => values_out,
456- threads= threads)
460+ df_op = DataFrames . combine (gdf, values => agg_fun => values_out,
461+ threads= threads)
457462
458463 group_rows = find_group_row (gdf)
459464 if ! issorted (group_rows)
460465 df_op = df_op[sortperm (group_rows), :]
461466 end
462- # set allowduplicates to true as we should not have any duplicates now
463- # and allowduplicates=true is a bit faster
464- allowduplicates = true
467+ # we should not have any duplicates in df_op now
468+ noduplicates = true
465469 else
466470 df_op = df
467471 values_out = values
472+ noduplicates = false
468473 end
469474
470475 g_rowkey = groupby (df_op, rowkeys)
471476 g_colkey = groupby (df_op, colkey)
472477 valuecol = df_op[! , values_out]
473478 return _unstack (df_op, index (df_op)[rowkeys], index (df_op)[colkey], g_colkey,
474- valuecol, g_rowkey, renamecols,
475- allowmissing, allowduplicates, fill)
479+ valuecol, g_rowkey, renamecols, allowmissing, noduplicates, fill)
476480end
477481
# Method of `unstack` for the case when no row keys are given: every column of
# `df` other than `colkey` and `values` is used as a row key. Keyword arguments
# are forwarded to the four-argument method, except for the deprecated
# `allowduplicates`, which is translated to `combine=last` here.
function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex;
                 renamecols::Function=identity, allowmissing::Bool=false,
                 allowduplicates::Bool=false, combine=only, fill=missing,
                 threads::Bool=true)
    if allowduplicates
        # `allowduplicates` is not forwarded below, so the callee keeps its
        # default and does not emit the deprecation warning a second time
        Base.depwarn("allowduplicates keyword argument is deprecated. " *
                     "Pass `combine=last` instead of `allowduplicates=true`.", :unstack)
        combine = last
    end
    # look both selectors up in the index of `df` so that they can be excluded
    # from the row keys with `Not`
    colkey_int = index(df)[colkey]
    value_int = index(df)[values]
    return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
                   renamecols=renamecols, allowmissing=allowmissing,
                   combine=combine, fill=fill, threads=threads)
end
490498
# Method of `unstack` that takes only the data frame: `:variable` is passed as
# the column key and `:value` as the values column. Keyword arguments are
# forwarded, except for the deprecated `allowduplicates`, which is translated
# to `combine=last` here.
function unstack(df::AbstractDataFrame; renamecols::Function=identity,
                 allowmissing::Bool=false, allowduplicates::Bool=false,
                 combine=only, fill=missing, threads::Bool=true)
    if allowduplicates
        # `allowduplicates` is not forwarded below, so the callee keeps its
        # default and does not emit the deprecation warning a second time
        Base.depwarn("allowduplicates keyword argument is deprecated. " *
                     "Pass `combine=last` instead of `allowduplicates=true`.", :unstack)
        combine = last
    end
    return unstack(df, :variable, :value, renamecols=renamecols,
                   allowmissing=allowmissing, combine=combine, fill=fill,
                   threads=threads)
end
498510
499511# we take into account the fact that idx, starts and ends are computed lazily
500512# so we rather directly reference the gdf.groups
521533function _unstack (df:: AbstractDataFrame , rowkeys:: AbstractVector{Int} ,
522534 colkey:: Int , g_colkey:: GroupedDataFrame ,
523535 valuecol:: AbstractVector , g_rowkey:: GroupedDataFrame ,
524- renamecols:: Function , allowmissing:: Bool ,
525- allowduplicates:: Bool , fill)
536+ renamecols:: Function , allowmissing:: Bool , noduplicates:: Bool , fill)
526537 rowref = g_rowkey. groups
527538 row_group_row_idxs = find_group_row (g_rowkey)
528539 Nrow = length (g_rowkey)
@@ -543,8 +554,8 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
543554 Nrow),
544555 fill) for _ in 1 : Ncol]
545556
546- # use a separate path for allowduplicates to reduce memory use and increase speed
547- if allowduplicates
557+ # use a separate path for noduplicates to reduce memory use and increase speed
558+ if noduplicates
548559 for (k, (row_id, col_id, val)) in enumerate (zip (rowref, colref, valuecol))
549560 unstacked_val[col_id][row_id] = val
550561 end
@@ -556,7 +567,8 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
556567 bad_var = colref_map[col_id]
557568 throw (ArgumentError (" Duplicate entries in unstack at row $k for key " *
558569 " $bad_key and variable $bad_var . " *
559- " Pass allowduplicates=true to allow them." ))
570+ " Pass `combine` keyword argument to specify " *
571+ " how they should be handled." ))
560572 end
561573 unstacked_val[col_id][row_id] = val
562574 mask_filled[row_id, col_id] = true
0 commit comments