From 5ac36ac541a22cd7a8a7f12b3c20386674b72685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 13 Oct 2020 20:37:19 +0200 Subject: [PATCH 01/20] initial (incorrect) changes --- src/abstractdataframe/reshape.jl | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 7db56ff20a..24cd1bd404 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -165,20 +165,30 @@ Note that there are some differences between the widened results above. """ function unstack(df::AbstractDataFrame, rowkey::ColumnIndex, colkey::ColumnIndex, value::ColumnIndex; renamecols::Function=identity) - refkeycol = categorical(df[!, rowkey]) - droplevels!(refkeycol) - keycol = categorical(df[!, colkey]) - droplevels!(keycol) + refkeycol = df[!, rowkey] + refkeycol isa PooledVector || (refkeycol = PooledArray(refkeycol)) + keycol = df[!, colkey] + keycol isa PooledVector || (keycol = PooledArray(keycol)) valuecol = df[!, value] return _unstack(df, index(df)[rowkey], index(df)[colkey], keycol, valuecol, refkeycol, renamecols) end +function preprocess_pooledvector(v::PooledVector) + used = falses(length(v.pool)) + for x in v.refs + used[x] |= true + end + v_unique = sum(used) + v_missing = something(findfirst(isequal(missing), v.pool), 0) + return v_unique, v_missing +end + function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, - keycol::CategoricalVector, valuecol::AbstractVector, - refkeycol::CategoricalVector, renamecols::Function) - Nrow = length(refkeycol.pool) - Ncol = length(keycol.pool) + keycol::PooledVector, valuecol::AbstractVector, + refkeycol::PooledVector, renamecols::Function) + Nrow, refkeycol_missing = preprocess_pooledvector(refkeycol) + Ncol, keycol_missing = preprocess_pooledvector(keycol) unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] hadmissing = false # have we encountered missing in refkeycol mask_filled = falses(Nrow+1, Ncol) # has a given [row,col] entry been filled? @@ -186,7 +196,7 @@ function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, warned_missing = false # have we already printed missing in keycol warning? for k in 1:nrow(df) kref = keycol.refs[k] - if kref <= 0 # we have found missing in colkey + if kref == keycol_missing # we have found missing in colkey if !warned_missing @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. 
Skipping.") warned_missing = true @@ -194,7 +204,7 @@ function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, continue # skip processing it end refkref = refkeycol.refs[k] - if refkref <= 0 # we have found missing in rowkey + if refkref == keycol_missing # we have found missing in rowkey if !hadmissing # if it is the first time we have to add a new row hadmissing = true # we use the fact that missing is greater than anything From f2ad670033c4c9f4b29beecbc07b156772128669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 21 Oct 2020 15:32:44 +0200 Subject: [PATCH 02/20] finalize unstack rework --- NEWS.md | 3 + src/abstractdataframe/reshape.jl | 424 ++++++++++++++++++++++--------- test/reshape.jl | 142 ++++++++--- 3 files changed, 405 insertions(+), 164 deletions(-) diff --git a/NEWS.md b/NEWS.md index 504d2386b3..6684d60d2c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -41,6 +41,9 @@ * in `describe` the specification of custom aggregation is now `function => name`; old `name => function` order is now deprecated ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) +* `unstack` now does not depend on CategoricalArrays.jl and has two new keyword + arguments `allowmissing` and `allowduplicates` + ([]()) ## New functionalities diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 24cd1bd404..0406a02dd5 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -43,16 +43,93 @@ that return views into the original data frame. # Examples ```julia -d1 = DataFrame(a = repeat([1:3;], inner = [4]), - b = repeat([1:4;], inner = [3]), - c = randn(12), - d = randn(12), - e = map(string, 'a':'l')) - -d1s = stack(d1, [:c, :d]) -d1s2 = stack(d1, [:c, :d], [:a]) -d1m = stack(d1, Not([:a, :b, :e])) -d1s_name = stack(d1, Not([:a, :b, :e]), variable_name=:somemeasure) +julia> df = DataFrame(a = repeat([1:3;], inner = [2]), + b = repeat([1:2;], inner = [3]), + c = randn(6), + d = randn(), + e = map(string, 'a':'f')) +6×5 DataFrame +│ Row │ a │ b │ c │ d │ e │ +│ │ Int64 │ Int64 │ Float64 │ Float64 │ String │ +├─────┼───────┼───────┼──────────┼──────────┼────────┤ +│ 1 │ 1 │ 1 │ -1.1078 │ 0.680175 │ a │ +│ 2 │ 1 │ 1 │ 0.078634 │ 0.680175 │ b │ +│ 3 │ 2 │ 1 │ -1.47615 │ 0.680175 │ c │ +│ 4 │ 2 │ 2 │ 0.826434 │ 0.680175 │ d │ +│ 5 │ 3 │ 2 │ 0.597258 │ 0.680175 │ e │ +│ 6 │ 3 │ 2 │ 1.49645 │ 0.680175 │ f │ + +julia> stack(df, [:c, :d]) +12×5 DataFrame +│ Row │ a │ b │ e │ variable │ value │ +│ │ Int64 │ Int64 │ String │ String │ Float64 │ +├─────┼───────┼───────┼────────┼──────────┼──────────┤ +│ 1 │ 1 │ 1 │ a │ c │ -1.1078 │ +│ 2 │ 1 │ 1 │ b │ c │ 0.078634 │ +│ 3 │ 2 │ 1 │ c │ c │ -1.47615 │ +│ 4 │ 2 │ 2 │ d │ c │ 0.826434 │ +│ 5 │ 3 │ 2 │ e │ c │ 0.597258 │ +│ 6 │ 3 │ 2 │ f │ c │ 1.49645 │ +│ 7 │ 1 │ 1 │ a │ d │ 0.680175 │ +│ 8 │ 1 │ 1 │ b │ d │ 0.680175 │ +│ 9 │ 2 │ 1 │ c │ d │ 0.680175 │ +│ 10 │ 2 │ 2 │ d │ d │ 0.680175 │ +│ 11 │ 3 │ 2 │ e │ d │ 0.680175 │ +│ 12 │ 3 │ 2 │ f │ d │ 0.680175 │ + +julia> stack(df, [:c, :d], [:a]) +12×3 DataFrame +│ Row │ a │ variable │ value │ +│ │ Int64 │ String │ Float64 │ +├─────┼───────┼──────────┼──────────┤ +│ 1 │ 1 │ c │ -1.1078 │ +│ 2 │ 1 │ c │ 0.078634 │ +│ 3 │ 2 │ c │ -1.47615 │ +│ 4 │ 2 │ c │ 0.826434 │ +│ 5 │ 3 │ c │ 0.597258 │ +│ 6 │ 3 │ c │ 1.49645 │ +│ 7 │ 1 │ d │ 0.680175 │ +│ 8 │ 1 │ d │ 0.680175 │ +│ 9 │ 2 │ d │ 0.680175 │ +│ 10 │ 2 │ d │ 0.680175 │ +│ 11 │ 3 │ d │ 0.680175 │ +│ 12 │ 3 │ d │ 0.680175 │ + +julia> stack(df, Not([:a, :b, :e])) +12×5 DataFrame +│ 
Row │ a │ b │ e │ variable │ value │ +│ │ Int64 │ Int64 │ String │ String │ Float64 │ +├─────┼───────┼───────┼────────┼──────────┼──────────┤ +│ 1 │ 1 │ 1 │ a │ c │ -1.1078 │ +│ 2 │ 1 │ 1 │ b │ c │ 0.078634 │ +│ 3 │ 2 │ 1 │ c │ c │ -1.47615 │ +│ 4 │ 2 │ 2 │ d │ c │ 0.826434 │ +│ 5 │ 3 │ 2 │ e │ c │ 0.597258 │ +│ 6 │ 3 │ 2 │ f │ c │ 1.49645 │ +│ 7 │ 1 │ 1 │ a │ d │ 0.680175 │ +│ 8 │ 1 │ 1 │ b │ d │ 0.680175 │ +│ 9 │ 2 │ 1 │ c │ d │ 0.680175 │ +│ 10 │ 2 │ 2 │ d │ d │ 0.680175 │ +│ 11 │ 3 │ 2 │ e │ d │ 0.680175 │ +│ 12 │ 3 │ 2 │ f │ d │ 0.680175 │ + +julia> stack(df, Not([:a, :b, :e]), variable_name=:somemeasure) +12×5 DataFrame +│ Row │ a │ b │ e │ somemeasure │ value │ +│ │ Int64 │ Int64 │ String │ String │ Float64 │ +├─────┼───────┼───────┼────────┼─────────────┼──────────┤ +│ 1 │ 1 │ 1 │ a │ c │ -1.1078 │ +│ 2 │ 1 │ 1 │ b │ c │ 0.078634 │ +│ 3 │ 2 │ 1 │ c │ c │ -1.47615 │ +│ 4 │ 2 │ 2 │ d │ c │ 0.826434 │ +│ 5 │ 3 │ 2 │ e │ c │ 0.597258 │ +│ 6 │ 3 │ 2 │ f │ c │ 1.49645 │ +│ 7 │ 1 │ 1 │ a │ d │ 0.680175 │ +│ 8 │ 1 │ 1 │ b │ d │ 0.680175 │ +│ 9 │ 2 │ 1 │ c │ d │ 0.680175 │ +│ 10 │ 2 │ 2 │ d │ d │ 0.680175 │ +│ 11 │ 3 │ 2 │ e │ d │ 0.680175 │ +│ 12 │ 3 │ 2 │ f │ d │ 0.680175 │ ``` """ function stack(df::AbstractDataFrame, @@ -121,19 +198,20 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int}, end """ - unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity) - unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity) - unstack(df::AbstractDataFrame; renamecols::Function=identity) + unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) + unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) + unstack(df::AbstractDataFrame; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) Unstack data frame `df`, i.e. convert it from long to wide format. -If `colkey` contains `missing` values then they will be skipped and a warning -will be printed. +Row keys and values from value column will be sorted by default unless they are +not ordered (i.e. passing them to `sort` fails) in which case the order of the +result is unspecified. -If combination of `rowkeys` and `colkey` contains duplicate entries then last -`value` will be retained and a warning will be printed. - -# Arguments +# Positional arguments - `df` : the AbstractDataFrame to be unstacked - `rowkeys` : the columns with a unique key for each row, if not given, find a key by grouping on anything not a `colkey` or `value`. @@ -141,127 +219,219 @@ If combination of `rowkeys` and `colkey` contains duplicate entries then last - `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide format, defaults to `:variable` - `value` : the value column ($COLUMNINDEX_STR), defaults to `:value` -- `renamecols` : a function called on each unique value in `colkey` which must - return the name of the column to be created (typically as a string - or a `Symbol`). Duplicate names are not allowed. +# Keyword arguments + +`renamecols` is a function called on each unique value in `colkey` which must +return the name of the column to be created (typically as a string or a +`Symbol`). Duplicates in resulting names when converted to `Symbol` are not allowed. 
+ +If `colkey` contains `missing` values then they will be included if +`allowmissing=true` and an error will be thrown otherwise (the default). + +If combination of `rowkeys` and `colkey` contains duplicate entries then last +`value` will be retained and a warning will be printed if `allowduplicates=true` +and an error will be thrown otherwise (the default). # Examples + ```julia -wide = DataFrame(id = 1:12, - a = repeat([1:3;], inner = [4]), - b = repeat([1:4;], inner = [3]), - c = randn(12), - d = randn(12)) - -long = stack(wide) -wide0 = unstack(long) -wide1 = unstack(long, :variable, :value) -wide2 = unstack(long, :id, :variable, :value) -wide3 = unstack(long, [:id, :a], :variable, :value) -wide4 = unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) +julia> wide = DataFrame(id = 1:6, + a = repeat([1:3;], inner = [2]), + b = repeat([1:2;], inner = [3]), + c = randn(6), + d = randn(6)) +6×5 DataFrame +│ Row │ id │ a │ b │ c │ d │ +│ │ Int64 │ Int64 │ Int64 │ Float64 │ Float64 │ +├─────┼───────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ +│ 2 │ 2 │ 1 │ 1 │ -0.917794 │ 0.940007 │ +│ 3 │ 3 │ 2 │ 1 │ 0.309629 │ 0.820397 │ +│ 4 │ 4 │ 2 │ 2 │ 1.46677 │ -1.03457 │ +│ 5 │ 5 │ 3 │ 2 │ 1.04339 │ -0.770464 │ +│ 6 │ 6 │ 3 │ 2 │ -0.172475 │ -2.81039 │ + +julia> long = stack(wide) +12×5 DataFrame +│ Row │ id │ a │ b │ variable │ value │ +│ │ Int64 │ Int64 │ Int64 │ String │ Float64 │ +├─────┼───────┼───────┼───────┼──────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ c │ 1.20649 │ +│ 2 │ 2 │ 1 │ 1 │ c │ -0.917794 │ +│ 3 │ 3 │ 2 │ 1 │ c │ 0.309629 │ +│ 4 │ 4 │ 2 │ 2 │ c │ 1.46677 │ +│ 5 │ 5 │ 3 │ 2 │ c │ 1.04339 │ +│ 6 │ 6 │ 3 │ 2 │ c │ -0.172475 │ +│ 7 │ 1 │ 1 │ 1 │ d │ -1.27628 │ +│ 8 │ 2 │ 1 │ 1 │ d │ 0.940007 │ +│ 9 │ 3 │ 2 │ 1 │ d │ 0.820397 │ +│ 10 │ 4 │ 2 │ 2 │ d │ -1.03457 │ +│ 11 │ 5 │ 3 │ 2 │ d │ -0.770464 │ +│ 12 │ 6 │ 3 │ 2 │ d │ -2.81039 │ + +julia> unstack(long) +6×5 DataFrame +│ Row │ id │ a │ b │ c │ d │ +│ │ Int64 │ Int64 │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ +│ 2 │ 2 │ 1 │ 1 │ -0.917794 │ 0.940007 │ +│ 3 │ 3 │ 2 │ 1 │ 0.309629 │ 0.820397 │ +│ 4 │ 4 │ 2 │ 2 │ 1.46677 │ -1.03457 │ +│ 5 │ 5 │ 3 │ 2 │ 1.04339 │ -0.770464 │ +│ 6 │ 6 │ 3 │ 2 │ -0.172475 │ -2.81039 │ + +julia> unstack(long, :variable, :value) +6×5 DataFrame +│ Row │ id │ a │ b │ c │ d │ +│ │ Int64 │ Int64 │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ +│ 2 │ 2 │ 1 │ 1 │ -0.917794 │ 0.940007 │ +│ 3 │ 3 │ 2 │ 1 │ 0.309629 │ 0.820397 │ +│ 4 │ 4 │ 2 │ 2 │ 1.46677 │ -1.03457 │ +│ 5 │ 5 │ 3 │ 2 │ 1.04339 │ -0.770464 │ +│ 6 │ 6 │ 3 │ 2 │ -0.172475 │ -2.81039 │ + +julia> unstack(long, :id, :variable, :value) +6×3 DataFrame +│ Row │ id │ c │ d │ +│ │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1.20649 │ -1.27628 │ +│ 2 │ 2 │ -0.917794 │ 0.940007 │ +│ 3 │ 3 │ 0.309629 │ 0.820397 │ +│ 4 │ 4 │ 1.46677 │ -1.03457 │ +│ 5 │ 5 │ 1.04339 │ -0.770464 │ +│ 6 │ 6 │ -0.172475 │ -2.81039 │ + +julia> unstack(long, [:id, :a], :variable, :value) +6×4 DataFrame +│ Row │ id │ a │ c │ d │ +│ │ Int64 │ Int64 │ Float64? │ Float64? 
│ +├─────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ +│ 2 │ 2 │ 1 │ -0.917794 │ 0.940007 │ +│ 3 │ 3 │ 2 │ 0.309629 │ 0.820397 │ +│ 4 │ 4 │ 2 │ 1.46677 │ -1.03457 │ +│ 5 │ 5 │ 3 │ 1.04339 │ -0.770464 │ +│ 6 │ 6 │ 3 │ -0.172475 │ -2.81039 │ + +julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) +6×3 DataFrame +│ Row │ id │ _c │ _d │ +│ │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1.20649 │ -1.27628 │ +│ 2 │ 2 │ -0.917794 │ 0.940007 │ +│ 3 │ 3 │ 0.309629 │ 0.820397 │ +│ 4 │ 4 │ 1.46677 │ -1.03457 │ +│ 5 │ 5 │ 1.04339 │ -0.770464 │ +│ 6 │ 6 │ -0.172475 │ -2.81039 │ ``` Note that there are some differences between the widened results above. """ function unstack(df::AbstractDataFrame, rowkey::ColumnIndex, colkey::ColumnIndex, - value::ColumnIndex; renamecols::Function=identity) + value::ColumnIndex; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) refkeycol = df[!, rowkey] - refkeycol isa PooledVector || (refkeycol = PooledArray(refkeycol)) keycol = df[!, colkey] - keycol isa PooledVector || (keycol = PooledArray(keycol)) valuecol = df[!, value] return _unstack(df, index(df)[rowkey], index(df)[colkey], - keycol, valuecol, refkeycol, renamecols) + keycol, valuecol, refkeycol, renamecols, allowmissing, allowduplicates) end -function preprocess_pooledvector(v::PooledVector) - used = falses(length(v.pool)) - for x in v.refs - used[x] |= true - end - v_unique = sum(used) - v_missing = something(findfirst(isequal(missing), v.pool), 0) - return v_unique, v_missing +function _unstack_preprocess_vector(v::AbstractVector) + v_unique = unique(v) + had_missing = any(ismissing, v_unique) + v_unique = intersect(levels(v), v_unique) + had_missing && (v_unique = vcat(v_unique, [missing])) + len_v = length(v_unique) + v_map = Dict([x => i for (i,x) in enumerate(v_unique)]) + # both unique and Dict should use isequal to test for identity of values + @assert length(v_map) == length(v_unique) + # if there are no missings in v then set reference index of missing to 0 + col = similar(v, length(v_unique)) + copyto!(col, v_unique) + return col, v_map, get(v_map, missing, 0) end function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, - keycol::PooledVector, valuecol::AbstractVector, - refkeycol::PooledVector, renamecols::Function) - Nrow, refkeycol_missing = preprocess_pooledvector(refkeycol) - Ncol, keycol_missing = preprocess_pooledvector(keycol) + keycol::AbstractVector, valuecol::AbstractVector, + refkeycol::AbstractVector, renamecols::Function, + allowmissing::Bool, allowduplicates::Bool) + col, refkeycol_map, refkeycol_missing = _unstack_preprocess_vector(refkeycol) + Nrow = length(refkeycol_map) + colnames, keycol_map, keycol_missing = _unstack_preprocess_vector(keycol) + Ncol = length(keycol_map) + + if keycol_missing != 0 && !allowmissing + throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " * + "Pass `allowmissing=true` to skip missings.")) + end + unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] - hadmissing = false # have we encountered missing in refkeycol - mask_filled = falses(Nrow+1, Ncol) # has a given [row,col] entry been filled? - warned_dup = false # have we already printed duplicate entries warning? - warned_missing = false # have we already printed missing in keycol warning? + mask_filled = falses(Nrow, Ncol) # has a given [row,col] entry been filled? 
for k in 1:nrow(df) - kref = keycol.refs[k] - if kref == keycol_missing # we have found missing in colkey - if !warned_missing - @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. Skipping.") - warned_missing = true - end - continue # skip processing it - end - refkref = refkeycol.refs[k] - if refkref == keycol_missing # we have found missing in rowkey - if !hadmissing # if it is the first time we have to add a new row - hadmissing = true - # we use the fact that missing is greater than anything - for i in eachindex(unstacked_val) - push!(unstacked_val[i], missing) - end - end - i = length(unstacked_val[1]) - else - i = refkref - end - if !warned_dup && mask_filled[i, kref] - @warn("Duplicate entries in unstack at row $k for key "* - "$(refkeycol[k]) and variable $(keycol[k]).") - warned_dup = true + kref = keycol_map[keycol[k]] + refkref = refkeycol_map[refkeycol[k]] + if !allowduplicates && mask_filled[refkref, kref] + throw(ArgumentError("Duplicate entries in unstack at row $k for key "* + "$(refkeycol[k]) and variable $(keycol[k]). " * + "Pass allowduplicates=true to allow them.")) end - unstacked_val[kref][i] = valuecol[k] - mask_filled[i, kref] = true + unstacked_val[kref][refkref] = valuecol[k] + mask_filled[refkref, kref] = true end - levs = levels(refkeycol) - # we have to handle a case with missings in refkeycol as levs will skip missing - col = similar(df[!, rowkey], length(levs) + hadmissing) - copyto!(col, levs) - hadmissing && (col[end] = missing) - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(levels(keycol))), copycols=false) - return insertcols!(df2, 1, _names(df)[rowkey] => col) + # note that Symbol.(renamecols.(colnames)) must produce unique column names + # and _names(df)[rowkey] must also produce a unique name + df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colnames)), copycols=false) + return insertcols!(df2, 1, _names(df)[rowkey] => col, copycols=false) end function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, - value::ColumnIndex; renamecols::Function=identity) + value::ColumnIndex; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) rowkey_ints = index(df)[rowkeys] @assert rowkey_ints isa AbstractVector{Int} length(rowkey_ints) == 0 && throw(ArgumentError("No key column found")) length(rowkey_ints) == 1 && return unstack(df, rowkey_ints[1], colkey, value, - renamecols=renamecols) - g = groupby(df, rowkey_ints, sort=true) - keycol = categorical(df[!, colkey]) - droplevels!(keycol) + renamecols=renamecols, + allowmissing=allowmissing, + allowduplicates=allowduplicates) + local g + try + g = groupby(df, rowkey_ints, sort=true) + catch + g = groupby(df, rowkey_ints, sort=false) + end + keycol = df[!, colkey] valuecol = df[!, value] - return _unstack(df, rowkey_ints, index(df)[colkey], keycol, valuecol, g, renamecols) + return _unstack(df, rowkey_ints, index(df)[colkey], keycol, valuecol, g, + renamecols, allowmissing, allowduplicates) end function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex; - renamecols::Function=identity) + renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) colkey_int = index(df)[colkey] value_int = index(df)[value] return unstack(df, Not(colkey_int, value_int), colkey_int, value_int, - renamecols=renamecols) + renamecols=renamecols, allowmissing=allowmissing, + allowduplicates=allowduplicates) end -unstack(df::AbstractDataFrame; renamecols::Function=identity) = - unstack(df, :variable, :value, 
renamecols=renamecols) +unstack(df::AbstractDataFrame; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) = + unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, + allowduplicates=allowduplicates) function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, - colkey::Int, keycol::CategoricalVector, + colkey::Int, keycol::AbstractVector, valuecol::AbstractVector, g::GroupedDataFrame, - renamecols::Function) + renamecols::Function, + allowmissing::Bool, allowduplicates::Bool) idx, starts, ends = g.idx, g.starts, g.ends groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)] rowkey = zeros(Int, size(df, 1)) @@ -270,30 +440,31 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, end df1 = df[idx[starts], g.cols] Nrow = length(g) - Ncol = length(levels(keycol)) + + colnames, keycol_map, keycol_missing = _unstack_preprocess_vector(keycol) + Ncol = length(keycol_map) + + if keycol_missing != 0 && !allowmissing + throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." * + " Pass `allowmissing=true` to skip missings.")) + end + unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] mask_filled = falses(Nrow, Ncol) - warned_dup = false - warned_missing = false for k in 1:nrow(df) - kref = keycol.refs[k] - if kref <= 0 - if !warned_missing - @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. Skipping.") - warned_missing = true - end - continue - end + kref = keycol_map[keycol[k]] i = rowkey[k] - if !warned_dup && mask_filled[i, kref] - @warn("Duplicate entries in unstack at row $k for key "* - "$(tuple((df[k,s] for s in rowkeys)...)) and variable $(keycol[k]).") - warned_dup = true + if !allowduplicates && mask_filled[i, kref] + throw(ArgumentError("Duplicate entries in unstack at row $k for key "* + "$(tuple((df[k,s] for s in rowkeys)...)) and variable $(keycol[k]). " * + "Pass allowduplicates=true to allow them.")) end unstacked_val[kref][i] = valuecol[k] mask_filled[i, kref] = true end - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(levels(keycol))), copycols=false) + # note that Symbol.(renamecols.(colnames)) must produce unique column names + # and names between df1 and df2 must be unique + df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colnames)), copycols=false) hcat(df1, df2, copycols=false) end @@ -346,10 +517,6 @@ Base.eltype(v::Type{StackedVector{T}}) where {T} = T Base.similar(v::StackedVector, T::Type, dims::Union{Integer, AbstractUnitRange}...) = similar(v.components[1], T, dims...) -CategoricalArrays.CategoricalArray(v::StackedVector) = - CategoricalArray(v[:]) # could be more efficient - - """ RepeatedVector{T} <: AbstractVector{T} @@ -387,8 +554,10 @@ end Base.parent(v::RepeatedVector) = v.parent DataAPI.levels(v::RepeatedVector) = levels(parent(v)) -CategoricalArrays.isordered(v::RepeatedVector{<:Union{CategoricalValue, Missing}}) = - isordered(parent(v)) + +# TODO: uncomment when DataAPI.jl supports this +# DataAPI.isordered(v::RepeatedVector) = +# isordered(parent(v)) function Base.getindex(v::RepeatedVector, i::Int) N = length(parent(v)) @@ -404,8 +573,9 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(parent(v)), v.inner, v. 
Base.similar(v::RepeatedVector, T::Type, dims::Dims) = similar(parent(v), T, dims) Base.unique(v::RepeatedVector) = unique(parent(v)) -function CategoricalArrays.CategoricalArray(v::RepeatedVector) - res = CategoricalArray(parent(v), levels=levels(parent(v))) - res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) - res -end +# TODO: @nalimilan: is there a generic way to support this? +# function CategoricalArrays.CategoricalArray(v::RepeatedVector) +# res = CategoricalArray(parent(v), levels=levels(parent(v))) +# res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) +# res +# end diff --git a/test/reshape.jl b/test/reshape.jl index 6e215e7e92..638b2f093b 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -1,6 +1,6 @@ module TestReshape -using Test, DataFrames, Random, Logging, PooledArrays, CategoricalArrays +using Test, DataFrames, Random, PooledArrays, CategoricalArrays const ≅ = isequal @testset "the output of unstack" begin @@ -143,46 +143,47 @@ end df = DataFrame(id=Union{Int, Missing}[1, 2, 1, 2], id2=Union{Int, Missing}[1, 2, 1, 2], variable=["a", "b", "a", "b"], value=[3, 4, 5, 6]) - @test_logs (:warn, "Duplicate entries in unstack at row 3 for key 1 and variable a.") unstack(df, :id, :variable, :value) - @test_logs (:warn, "Duplicate entries in unstack at row 3 for key (1, 1) and variable a.") unstack(df, :variable, :value) - a, b = with_logger(NullLogger()) do - unstack(df, :id, :variable, :value), unstack(df, :variable, :value) - end + @test_throws ArgumentError unstack(df, :id, :variable, :value) + @test_throws ArgumentError unstack(df, :variable, :value) + a = unstack(df, :id, :variable, :value, allowduplicates=true) + b = unstack(df, :variable, :value, allowduplicates=true) @test a ≅ DataFrame(id = [1, 2], a = [5, missing], b = [missing, 6]) @test b ≅ DataFrame(id = [1, 2], id2 = [1, 2], a = [5, missing], b = [missing, 6]) df = DataFrame(id=1:2, variable=["a", "b"], value=3:4) - @test_nowarn unstack(df, :id, :variable, :value) - @test_nowarn unstack(df, :variable, :value) a = unstack(df, :id, :variable, :value) b = unstack(df, :variable, :value) @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4]) df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1]) - @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value) - @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value) + @test_throws ArgumentError unstack(df, :variable, :value) + @test_throws ArgumentError unstack(df, :id, :variable, :value) + @test unstack(df, :variable, :value, allowduplicates=true) ≅ DataFrame(id=1, x=missing) + @test unstack(df, :id, :variable, :value, allowduplicates=true) ≅ DataFrame(id=1, x=missing) end @testset "missing values in colkey" begin df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"], value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]) - @test_logs (:warn, "Missing value in variable :variable at row 3. 
Skipping.") unstack(df, :variable, :value) - udf = with_logger(NullLogger()) do - unstack(df, :variable, :value) - end - @test propertynames(udf) == [:id, :a, :b, :missing] + @test_throws ArgumentError unstack(df, :variable, :value) + @test_throws ArgumentError unstack(df, :variable, :value, allowmissing=true) + udf = unstack(df, :variable, :value, allowmissing=true, renamecols=x -> coalesce(x, "MISSING")) + @test propertynames(udf) == [:id, :a, :b, :missing, :MISSING] @test udf[!, :missing] ≅ [missing, 9.0, missing] + @test udf[!, :MISSING] ≅ [3.0, missing, missing] + df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], id2=[1, 1, 1, missing, missing, missing, 2, 2, 2], variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"], value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]) - @test_logs (:warn, "Missing value in variable :variable at row 3. Skipping.") unstack(df, 3, 4) - udf = with_logger(NullLogger()) do - unstack(df, 3, 4) - end - @test propertynames(udf) == [:id, :id2, :a, :b, :missing] + @test_throws ArgumentError unstack(df, 3, 4) + @test_throws ArgumentError unstack(df, 3, 4, allowmissing=true) + udf = unstack(df, 3, 4, allowmissing=true, renamecols=x -> coalesce(x, "MISSING")) + + @test propertynames(udf) == [:id, :id2, :a, :b, :missing, :MISSING] @test udf[!, :missing] ≅ [missing, 9.0, missing] + @test udf[!, :MISSING] ≅ [3.0, missing, missing] end @testset "stack-unstack correctness" begin @@ -419,23 +420,25 @@ end @test_throws ArgumentError flatten(df_bad, [:b, :c]) end -@testset "test RepeatedVector for categorical" begin - v = categorical(["a", "b", "c"], ordered=true) - levels!(v, ["b", "c", "a"]) - rv = DataFrames.RepeatedVector(v, 1, 1) - @test isordered(v) - @test isordered(categorical(v)) - @test levels(v) == ["b", "c", "a"] - @test levels(categorical(v)) == ["b", "c", "a"] - - v = categorical(["a", "b", "c"]) - levels!(v, ["b", "c", "a"]) - rv = DataFrames.RepeatedVector(v, 1, 1) - @test !isordered(v) - @test !isordered(categorical(v)) - @test levels(v) == ["b", "c", "a"] - @test levels(categorical(v)) == ["b", "c", "a"] -end +# TODO: uncomment these tests when we improve handling of categorical here + +# @testset "test RepeatedVector for categorical" begin +# v = categorical(["a", "b", "c"], ordered=true) +# levels!(v, ["b", "c", "a"]) +# rv = DataFrames.RepeatedVector(v, 1, 1) +# @test isordered(rv) +# @test isordered(categorical(rv)) +# @test levels(rv) == ["b", "c", "a"] +# @test levels(categorical(rv)) == ["b", "c", "a"] + +# v = categorical(["a", "b", "c"]) +# levels!(v, ["b", "c", "a"]) +# rv = DataFrames.RepeatedVector(v, 1, 1) +# @test !isordered(rv) +# @test !isordered(categorical(rv)) +# @test levels(rv) == ["b", "c", "a"] +# @test levels(categorical(rv)) == ["b", "c", "a"] +# end @testset "stack categorical test" begin Random.seed!(1234) @@ -507,4 +510,69 @@ end @test eltype(typeof(sdf2.value)) === Float64 end +@testset "additional unstack tests" begin + df = DataFrame(id=repeat(1:3, inner=3), + id2=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == + DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + + Random.seed!(1234) + for i in 1:3 + df = df[Random.shuffle(1:9), :] + @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == + DataFrame(id=1:3, 
id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + end + + df = DataFrame(id=repeat(1:3, inner=3), + a=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test_throws ArgumentError unstack(df, :a, :var, :val) + @test_throws ArgumentError unstack(df, [:id, :a], :var, :val) + + df = DataFrame(id=repeat(1:3, inner=3), + id2=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + df[4, 1:2] .= 1 + @test_throws ArgumentError unstack(df, :id, :var, :val) + @test_throws ArgumentError unstack(df, [:id, :id2], :var, :val) + @test unstack(df, :id, :var, :val, allowduplicates=true) ≅ + DataFrame(id=1:3, a=[4, missing, 7], b=2:3:8, c=3:3:9) + @test unstack(df, [:id, :id2], :var, :val, allowduplicates=true) ≅ + DataFrame(id=1:3, id2=1:3, a=[4, missing, 7], b=2:3:8, c=3:3:9) + + df = DataFrame(id=repeat(1:3, inner=3), + id2=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + allowmissing!(df, :var) + df.var[4] = missing + @test_throws ArgumentError unstack(df, :id, :var, :val) + @test_throws ArgumentError unstack(df, [:id, :id2], :var, :val) + @test unstack(df, :id, :var, :val, allowmissing=true) ≅ + DataFrame(id=1:3, a=[1, missing, 7], b=2:3:8, c=3:3:9, missing=[missing, 4, missing]) + @test unstack(df, [:id, :id2], :var, :val, allowmissing=true) ≅ + DataFrame(id=1:3, id2=1:3, a=[1, missing, 7], b=2:3:8, c=3:3:9, missing=[missing, 4, missing]) + + # test scenario when sorting fails both in grouping and in variable + struct A + x + end + + df = DataFrame(id=repeat(A.([2, 1, 3]), inner=3), + id2=repeat(A.([2, 1, 3]), inner=3), + var=repeat(A.([3, 2, 1]), 3), + val=1:9) + @test unstack(df, :id, :var, :val, renamecols=x -> Symbol(:x, x.x)) == + DataFrame(id=A.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) + @test unstack(df, [:id, :id2], :var, :val, renamecols=x -> Symbol(:x, x.x)) == + DataFrame(id=A.([2, 1, 3]), id2=A.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) +end + end # module From a9d7fb79539c62d0306364de71beb15c596147ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 21 Oct 2020 15:36:03 +0200 Subject: [PATCH 03/20] update NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7124f7f908..2111aca430 100644 --- a/NEWS.md +++ b/NEWS.md @@ -43,7 +43,7 @@ ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) * `unstack` now does not depend on CategoricalArrays.jl and has two new keyword arguments `allowmissing` and `allowduplicates` - ([]()) + ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494)) ## New functionalities From 85b1e9a6ff0d1f243f79a8540373af220ccd024a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 21 Oct 2020 21:13:23 +0200 Subject: [PATCH 04/20] fix test --- test/reshape.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/reshape.jl b/test/reshape.jl index 4eda4aa8a5..4159104946 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -573,6 +573,7 @@ end DataFrame(id=A.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) @test unstack(df, [:id, :id2], :var, :val, renamecols=x -> Symbol(:x, x.x)) == DataFrame(id=A.([2, 1, 3]), id2=A.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) +end @testset "permutedims" begin df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2)) From 1c14d07a463a076608e89955dae223dcb8e3d348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 22 Oct 2020 11:11:27 +0200 Subject: [PATCH 05/20] fix local 
scope limitation --- test/reshape.jl | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/test/reshape.jl b/test/reshape.jl index 4159104946..30e4a3908b 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -559,20 +559,22 @@ end DataFrame(id=1:3, a=[1, missing, 7], b=2:3:8, c=3:3:9, missing=[missing, 4, missing]) @test unstack(df, [:id, :id2], :var, :val, allowmissing=true) ≅ DataFrame(id=1:3, id2=1:3, a=[1, missing, 7], b=2:3:8, c=3:3:9, missing=[missing, 4, missing]) +end - # test scenario when sorting fails both in grouping and in variable - struct A - x - end +# test scenario when sorting fails both in grouping and in variable +struct A_TYPE + x +end - df = DataFrame(id=repeat(A.([2, 1, 3]), inner=3), - id2=repeat(A.([2, 1, 3]), inner=3), - var=repeat(A.([3, 2, 1]), 3), +@testset "additional unstack tests not sortable" begin + df = DataFrame(id=repeat(A_TYPE.([2, 1, 3]), inner=3), + id2=repeat(A_TYPE.([2, 1, 3]), inner=3), + var=repeat(A_TYPE.([3, 2, 1]), 3), val=1:9) @test unstack(df, :id, :var, :val, renamecols=x -> Symbol(:x, x.x)) == - DataFrame(id=A.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) + DataFrame(id=A_TYPE.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) @test unstack(df, [:id, :id2], :var, :val, renamecols=x -> Symbol(:x, x.x)) == - DataFrame(id=A.([2, 1, 3]), id2=A.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) + DataFrame(id=A_TYPE.([2, 1, 3]), id2=A_TYPE.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) end @testset "permutedims" begin From 7ff37c39babcbc501bae6b7235b5177ba88d41f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 26 Oct 2020 14:16:55 +0100 Subject: [PATCH 06/20] updates after the review --- src/abstractdataframe/reshape.jl | 35 +++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 56d07a2918..9f1171b648 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -222,16 +222,16 @@ result is unspecified. # Keyword arguments -`renamecols` is a function called on each unique value in `colkey` which must -return the name of the column to be created (typically as a string or a -`Symbol`). Duplicates in resulting names when converted to `Symbol` are not allowed. - -If `colkey` contains `missing` values then they will be included if -`allowmissing=true` and an error will be thrown otherwise (the default). - -If combination of `rowkeys` and `colkey` contains duplicate entries then last -`value` will be retained and a warning will be printed if `allowduplicates=true` -and an error will be thrown otherwise (the default). +- `renamecols`: a function called on each unique value in `colkey`; it must return + the name of the column to be created (typically as a string or a `Symbol`). + Duplicates in resulting names when converted to `Symbol` are not allowed. + By default no transformation is performed. +- `allowmissing`: if `false` (the default) then an error will be thrown if `colkey` + contains `missing` values; if `true` then a column referring to `missing` value + will be created. +- allowduplicates`: if `false` (the default) then an error an error will be thrown + if combination of `rowkeys` and `colkey` contains duplicate entries; if `true` + then then the last encountered `value` will be retained. 
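A minimal sketch of how the two new keyword arguments interact, on a hypothetical three-row input (the expected outcomes follow from the description above, not from output captured in this patch):

```julia
using DataFrames

df = DataFrame(id=[1, 1, 2], variable=["a", "a", missing], value=[1, 2, 3])

# default: throws ArgumentError because :variable contains a missing value
unstack(df, :id, :variable, :value)

# still throws ArgumentError: the key (id=1, variable="a") occurs twice
unstack(df, :id, :variable, :value, allowmissing=true)

# keeps the last value (2) for the duplicated key and adds a column for the
# missing key, giving a 2×3 data frame with columns :id, :a and :missing
unstack(df, :id, :variable, :value, allowmissing=true, allowduplicates=true)
```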
# Examples @@ -400,12 +400,15 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, renamecols=renamecols, allowmissing=allowmissing, allowduplicates=allowduplicates) - local g - try - g = groupby(df, rowkey_ints, sort=true) - catch - g = groupby(df, rowkey_ints, sort=false) + dosort = true + if !all(i -> isordered(df[!, i]), rowkey_ints) # avoid issorted in most cases + try + map(i -> issorted(df[!, i]), rowkey_ints) # this should be relatively cheap + catch + dosort = false + end end + g = groupby(df, rowkey_ints, sort=dosort) keycol = df[!, colkey] valuecol = df[!, value] return _unstack(df, rowkey_ints, index(df)[colkey], keycol, valuecol, g, @@ -681,4 +684,4 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; dest_namescol = src_namescol end return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) -end \ No newline at end of file +end From b5a7778dcdb5156c1dbb0fc5650bcc763464c762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 26 Oct 2020 20:55:45 +0100 Subject: [PATCH 07/20] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/reshape.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 9f1171b648..c945a4120b 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -343,16 +343,14 @@ function unstack(df::AbstractDataFrame, rowkey::ColumnIndex, colkey::ColumnIndex end function _unstack_preprocess_vector(v::AbstractVector) - v_unique = unique(v) - had_missing = any(ismissing, v_unique) - v_unique = intersect(levels(v), v_unique) - had_missing && (v_unique = vcat(v_unique, [missing])) + v_unique = intersect([levels(v); missing], unique(v)) len_v = length(v_unique) v_map = Dict([x => i for (i,x) in enumerate(v_unique)]) # both unique and Dict should use isequal to test for identity of values @assert length(v_map) == length(v_unique) # if there are no missings in v then set reference index of missing to 0 col = similar(v, length(v_unique)) + @assert firstindex(col) == 1 # we do not support e.g. OffsetArrays.jl yet copyto!(col, v_unique) return col, v_map, get(v_map, missing, 0) end From 2efe294070221b62b9f26f67d0d42669b8a67a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 27 Oct 2020 00:54:32 +0100 Subject: [PATCH 08/20] go for PooledVector implementation --- src/abstractdataframe/reshape.jl | 99 ++++++++++++++------------------ 1 file changed, 42 insertions(+), 57 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index c945a4120b..9b4a69f261 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -207,9 +207,8 @@ end Unstack data frame `df`, i.e. convert it from long to wide format. -Row keys and values from value column will be sorted by default unless they are -not ordered (i.e. passing them to `sort` fails) in which case the order of the -result is unspecified. +Row keys and values from value column will be ordered in the order of their +appearance in the respective vectors. 
# Positional arguments - `df` : the AbstractDataFrame to be unstacked @@ -336,56 +335,50 @@ function unstack(df::AbstractDataFrame, rowkey::ColumnIndex, colkey::ColumnIndex value::ColumnIndex; renamecols::Function=identity, allowmissing::Bool=false, allowduplicates::Bool=false) refkeycol = df[!, rowkey] + rowref, rowref_map = _unstack_preprocess_vector(refkeycol) keycol = df[!, colkey] + colref, colref_map = _unstack_preprocess_vector(keycol) valuecol = df[!, value] return _unstack(df, index(df)[rowkey], index(df)[colkey], - keycol, valuecol, refkeycol, renamecols, allowmissing, allowduplicates) + colref, colref_map, valuecol, rowref, rowref_map, + renamecols, allowmissing, allowduplicates) end function _unstack_preprocess_vector(v::AbstractVector) - v_unique = intersect([levels(v); missing], unique(v)) - len_v = length(v_unique) - v_map = Dict([x => i for (i,x) in enumerate(v_unique)]) - # both unique and Dict should use isequal to test for identity of values - @assert length(v_map) == length(v_unique) - # if there are no missings in v then set reference index of missing to 0 - col = similar(v, length(v_unique)) - @assert firstindex(col) == 1 # we do not support e.g. OffsetArrays.jl yet - copyto!(col, v_unique) - return col, v_map, get(v_map, missing, 0) + # make sure we re-allocate if v is already a PooledArray + vp = PooledArray{eltype(v)}(v) + return DataAPI.refarray(vp), DataAPI.refpool(vp) end function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, - keycol::AbstractVector, valuecol::AbstractVector, - refkeycol::AbstractVector, renamecols::Function, + colref::AbstractVector, colref_map::AbstractVector, valuecol::AbstractVector, + rowref::AbstractVector, rowref_map::AbstractVector, renamecols::Function, allowmissing::Bool, allowduplicates::Bool) - col, refkeycol_map, refkeycol_missing = _unstack_preprocess_vector(refkeycol) - Nrow = length(refkeycol_map) - colnames, keycol_map, keycol_missing = _unstack_preprocess_vector(keycol) - Ncol = length(keycol_map) + Nrow = length(rowref_map) + Ncol = length(colref_map) - if keycol_missing != 0 && !allowmissing + if any(ismissing, colref_map) && !allowmissing throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " * "Pass `allowmissing=true` to skip missings.")) end unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] mask_filled = falses(Nrow, Ncol) # has a given [row,col] entry been filled? - for k in 1:nrow(df) - kref = keycol_map[keycol[k]] - refkref = refkeycol_map[refkeycol[k]] - if !allowduplicates && mask_filled[refkref, kref] + + @assert length(rowref) == length(colref) == length(valuecol) + for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) + if !allowduplicates && mask_filled[row_id, col_id] throw(ArgumentError("Duplicate entries in unstack at row $k for key "* - "$(refkeycol[k]) and variable $(keycol[k]). " * + "$(rowref_map[row_id]) and variable $(colref_map[col_id]). 
" * "Pass allowduplicates=true to allow them.")) end - unstacked_val[kref][refkref] = valuecol[k] - mask_filled[refkref, kref] = true + unstacked_val[col_id][row_id] = val + mask_filled[row_id, col_id] = true end - # note that Symbol.(renamecols.(colnames)) must produce unique column names + # note that Symbol.(renamecols.(colref_map)) must produce unique column names # and _names(df)[rowkey] must also produce a unique name - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colnames)), copycols=false) - return insertcols!(df2, 1, _names(df)[rowkey] => col, copycols=false) + df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false) + return insertcols!(df2, 1, _names(df)[rowkey] => rowref_map, copycols=false) end function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, @@ -398,19 +391,12 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, renamecols=renamecols, allowmissing=allowmissing, allowduplicates=allowduplicates) - dosort = true - if !all(i -> isordered(df[!, i]), rowkey_ints) # avoid issorted in most cases - try - map(i -> issorted(df[!, i]), rowkey_ints) # this should be relatively cheap - catch - dosort = false - end - end - g = groupby(df, rowkey_ints, sort=dosort) + g = groupby(df, rowkey_ints) keycol = df[!, colkey] + colref, colref_map = _unstack_preprocess_vector(keycol) valuecol = df[!, value] - return _unstack(df, rowkey_ints, index(df)[colkey], keycol, valuecol, g, - renamecols, allowmissing, allowduplicates) + return _unstack(df, rowkey_ints, index(df)[colkey], colref, colref_map, + valuecol, g, renamecols, allowmissing, allowduplicates) end function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex; @@ -429,43 +415,42 @@ unstack(df::AbstractDataFrame; renamecols::Function=identity, allowduplicates=allowduplicates) function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, - colkey::Int, keycol::AbstractVector, + colkey::Int, colref::AbstractVector, colref_map::AbstractVector, valuecol::AbstractVector, g::GroupedDataFrame, renamecols::Function, allowmissing::Bool, allowduplicates::Bool) - idx, starts, ends = g.idx, g.starts, g.ends + idx::Vector{Int}, starts::Vector{Int}, ends::Vector{Int} = g.idx, g.starts, g.ends groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)] - rowkey = zeros(Int, size(df, 1)) + rowref = zeros(Int, size(df, 1)) for i in 1:length(groupidxs) - rowkey[groupidxs[i]] .= i + rowref[groupidxs[i]] .= i end df1 = df[idx[starts], g.cols] Nrow = length(g) - colnames, keycol_map, keycol_missing = _unstack_preprocess_vector(keycol) - Ncol = length(keycol_map) + Ncol = length(colref_map) - if keycol_missing != 0 && !allowmissing + if any(ismissing, colref_map) && !allowmissing throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." * " Pass `allowmissing=true` to skip missings.")) end unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] mask_filled = falses(Nrow, Ncol) - for k in 1:nrow(df) - kref = keycol_map[keycol[k]] - i = rowkey[k] - if !allowduplicates && mask_filled[i, kref] + + @assert length(rowref) == length(colref) == length(valuecol) + for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) + if !allowduplicates && mask_filled[row_id, col_id] throw(ArgumentError("Duplicate entries in unstack at row $k for key "* - "$(tuple((df[k,s] for s in rowkeys)...)) and variable $(keycol[k]). " * + "$(tuple((df[k,s] for s in rowkeys)...)) and variable $(colref_map[col_id]). 
" * "Pass allowduplicates=true to allow them.")) end - unstacked_val[kref][i] = valuecol[k] - mask_filled[i, kref] = true + unstacked_val[col_id][row_id] = val + mask_filled[row_id, col_id] = true end # note that Symbol.(renamecols.(colnames)) must produce unique column names # and names between df1 and df2 must be unique - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colnames)), copycols=false) + df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false) hcat(df1, df2, copycols=false) end From 3d2ae40942adf24c6493a34730c4f947b86d2432 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 27 Oct 2020 01:13:43 +0100 Subject: [PATCH 09/20] proper handling of CategoricalArrays.jl --- src/abstractdataframe/reshape.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 9b4a69f261..5f2bd0068a 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -347,7 +347,11 @@ end function _unstack_preprocess_vector(v::AbstractVector) # make sure we re-allocate if v is already a PooledArray vp = PooledArray{eltype(v)}(v) - return DataAPI.refarray(vp), DataAPI.refpool(vp) + rp = DataAPI.refpool(vp) + pool = similar(v, length(rp)) + @assert firstindex(pool) == 1 + copyto!(pool, rp) + return DataAPI.refarray(vp), pool end function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, From cf434a7944edb49e68c47becdcffcc5a2b21cda0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 27 Oct 2020 14:12:46 +0100 Subject: [PATCH 10/20] remove unsupported functionality --- src/abstractdataframe/reshape.jl | 7 ------- test/reshape.jl | 20 -------------------- 2 files changed, 27 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 5f2bd0068a..caa96dc986 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -563,13 +563,6 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(parent(v)), v.inner, v. Base.similar(v::RepeatedVector, T::Type, dims::Dims) = similar(parent(v), T, dims) Base.unique(v::RepeatedVector) = unique(parent(v)) -# TODO: @nalimilan: is there a generic way to support this? -# function CategoricalArrays.CategoricalArray(v::RepeatedVector) -# res = CategoricalArray(parent(v), levels=levels(parent(v))) -# res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) -# res -# end - Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. 
Try `permutedims` instead") diff --git a/test/reshape.jl b/test/reshape.jl index 30e4a3908b..4a2cb6ad1b 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -420,26 +420,6 @@ end @test_throws ArgumentError flatten(df_bad, [:b, :c]) end -# TODO: uncomment these tests when we improve handling of categorical here - -# @testset "test RepeatedVector for categorical" begin -# v = categorical(["a", "b", "c"], ordered=true) -# levels!(v, ["b", "c", "a"]) -# rv = DataFrames.RepeatedVector(v, 1, 1) -# @test isordered(rv) -# @test isordered(categorical(rv)) -# @test levels(rv) == ["b", "c", "a"] -# @test levels(categorical(rv)) == ["b", "c", "a"] - -# v = categorical(["a", "b", "c"]) -# levels!(v, ["b", "c", "a"]) -# rv = DataFrames.RepeatedVector(v, 1, 1) -# @test !isordered(rv) -# @test !isordered(categorical(rv)) -# @test levels(rv) == ["b", "c", "a"] -# @test levels(categorical(rv)) == ["b", "c", "a"] -# end - @testset "stack categorical test" begin Random.seed!(1234) d1 = DataFrame(a = repeat([1:3;], inner = [4]), From f4e6a0155ef56f88cf08ae27674d1874f8b88e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 27 Oct 2020 19:04:00 +0100 Subject: [PATCH 11/20] remove DataAPI support for RepeatedVector --- src/abstractdataframe/reshape.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index caa96dc986..973a19cdec 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -543,11 +543,6 @@ struct RepeatedVector{T} <: AbstractVector{T} end Base.parent(v::RepeatedVector) = v.parent -DataAPI.levels(v::RepeatedVector) = levels(parent(v)) - -# TODO: uncomment when DataAPI.jl supports this -# DataAPI.isordered(v::RepeatedVector) = -# isordered(parent(v)) function Base.getindex(v::RepeatedVector, i::Int) N = length(parent(v)) From a8f5f9be9f8a73076996676b7924582691f11ac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 27 Oct 2020 19:21:22 +0100 Subject: [PATCH 12/20] use groupby everywhere --- src/abstractdataframe/reshape.jl | 101 ++++++++----------------------- 1 file changed, 25 insertions(+), 76 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 973a19cdec..4d0f70dc4b 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -331,76 +331,17 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) ``` Note that there are some differences between the widened results above. 
""" -function unstack(df::AbstractDataFrame, rowkey::ColumnIndex, colkey::ColumnIndex, - value::ColumnIndex; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false) - refkeycol = df[!, rowkey] - rowref, rowref_map = _unstack_preprocess_vector(refkeycol) - keycol = df[!, colkey] - colref, colref_map = _unstack_preprocess_vector(keycol) - valuecol = df[!, value] - return _unstack(df, index(df)[rowkey], index(df)[colkey], - colref, colref_map, valuecol, rowref, rowref_map, - renamecols, allowmissing, allowduplicates) -end - -function _unstack_preprocess_vector(v::AbstractVector) - # make sure we re-allocate if v is already a PooledArray - vp = PooledArray{eltype(v)}(v) - rp = DataAPI.refpool(vp) - pool = similar(v, length(rp)) - @assert firstindex(pool) == 1 - copyto!(pool, rp) - return DataAPI.refarray(vp), pool -end - -function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, - colref::AbstractVector, colref_map::AbstractVector, valuecol::AbstractVector, - rowref::AbstractVector, rowref_map::AbstractVector, renamecols::Function, - allowmissing::Bool, allowduplicates::Bool) - Nrow = length(rowref_map) - Ncol = length(colref_map) - - if any(ismissing, colref_map) && !allowmissing - throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " * - "Pass `allowmissing=true` to skip missings.")) - end - - unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] - mask_filled = falses(Nrow, Ncol) # has a given [row,col] entry been filled? - - @assert length(rowref) == length(colref) == length(valuecol) - for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) - if !allowduplicates && mask_filled[row_id, col_id] - throw(ArgumentError("Duplicate entries in unstack at row $k for key "* - "$(rowref_map[row_id]) and variable $(colref_map[col_id]). 
" * - "Pass allowduplicates=true to allow them.")) - end - unstacked_val[col_id][row_id] = val - mask_filled[row_id, col_id] = true - end - # note that Symbol.(renamecols.(colref_map)) must produce unique column names - # and _names(df)[rowkey] must also produce a unique name - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false) - return insertcols!(df2, 1, _names(df)[rowkey] => rowref_map, copycols=false) -end - function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, value::ColumnIndex; renamecols::Function=identity, allowmissing::Bool=false, allowduplicates::Bool=false) - rowkey_ints = index(df)[rowkeys] + rowkey_ints = vcat(index(df)[rowkeys]) @assert rowkey_ints isa AbstractVector{Int} length(rowkey_ints) == 0 && throw(ArgumentError("No key column found")) - length(rowkey_ints) == 1 && return unstack(df, rowkey_ints[1], colkey, value, - renamecols=renamecols, - allowmissing=allowmissing, - allowduplicates=allowduplicates) - g = groupby(df, rowkey_ints) - keycol = df[!, colkey] - colref, colref_map = _unstack_preprocess_vector(keycol) + g_rowkey = groupby(df, rowkey_ints) + g_colkey = groupby(df, colkey) valuecol = df[!, value] - return _unstack(df, rowkey_ints, index(df)[colkey], colref, colref_map, - valuecol, g, renamecols, allowmissing, allowduplicates) + return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey, + valuecol, g_rowkey, renamecols, allowmissing, allowduplicates) end function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex; @@ -418,21 +359,29 @@ unstack(df::AbstractDataFrame; renamecols::Function=identity, unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, allowduplicates=allowduplicates) -function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, - colkey::Int, colref::AbstractVector, colref_map::AbstractVector, - valuecol::AbstractVector, g::GroupedDataFrame, - renamecols::Function, - allowmissing::Bool, allowduplicates::Bool) +function getrefs(g::GroupedDataFrame) idx::Vector{Int}, starts::Vector{Int}, ends::Vector{Int} = g.idx, g.starts, g.ends groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)] - rowref = zeros(Int, size(df, 1)) + ref = zeros(Int, size(parent(g), 1)) for i in 1:length(groupidxs) - rowref[groupidxs[i]] .= i + ref[groupidxs[i]] .= i end - df1 = df[idx[starts], g.cols] - Nrow = length(g) + return ref +end - Ncol = length(colref_map) +function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, + colkey::Int, g_colkey::GroupedDataFrame, + valuecol::AbstractVector, g_rowkey::GroupedDataFrame, + renamecols::Function, + allowmissing::Bool, allowduplicates::Bool) + rowref = getrefs(g_rowkey) + df1 = df[g_rowkey.idx[g_rowkey.starts], g_rowkey.cols] + Nrow = length(g_rowkey) + + @assert groupcols(g_colkey) == _names(df)[colkey:colkey] + colref = getrefs(g_colkey) + Ncol = length(g_colkey) + colref_map = df[g_colkey.starts, colkey] if any(ismissing, colref_map) && !allowmissing throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." * @@ -446,13 +395,13 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) if !allowduplicates && mask_filled[row_id, col_id] throw(ArgumentError("Duplicate entries in unstack at row $k for key "* - "$(tuple((df[k,s] for s in rowkeys)...)) and variable $(colref_map[col_id]). " * + "$(tuple((df[k, s] for s in rowkeys)...)) and variable $(colref_map[col_id]). 
" * "Pass allowduplicates=true to allow them.")) end unstacked_val[col_id][row_id] = val mask_filled[row_id, col_id] = true end - # note that Symbol.(renamecols.(colnames)) must produce unique column names + # note that Symbol.(renamecols.(colref_map)) must produce unique column names # and names between df1 and df2 must be unique df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false) hcat(df1, df2, copycols=false) From 585ae07b6877ee11ecb4f370abff29d70bec5e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 28 Oct 2020 00:12:10 +0100 Subject: [PATCH 13/20] further performance improvements --- NEWS.md | 4 ++-- src/abstractdataframe/reshape.jl | 33 +++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2111aca430..20fdbf398f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -41,8 +41,8 @@ * in `describe` the specification of custom aggregation is now `function => name`; old `name => function` order is now deprecated ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) -* `unstack` now does not depend on CategoricalArrays.jl and has two new keyword - arguments `allowmissing` and `allowduplicates` +* `unstack` now does produce rows and columns in their order of appereance in the source + and has two new keyword arguments `allowmissing` and `allowduplicates` ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494)) ## New functionalities diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 4d0f70dc4b..ece5be7ebf 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -359,14 +359,25 @@ unstack(df::AbstractDataFrame; renamecols::Function=identity, unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, allowduplicates=allowduplicates) -function getrefs(g::GroupedDataFrame) - idx::Vector{Int}, starts::Vector{Int}, ends::Vector{Int} = g.idx, g.starts, g.ends - groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)] - ref = zeros(Int, size(parent(g), 1)) - for i in 1:length(groupidxs) - ref[groupidxs[i]] .= i +# we take into account the fact that idx, starts and ends are computed lazily +# so we rather directly reference the gdf.groups +# this function is tailor made for unstack so it does assume that no groups were +# dropped (i.e. 
gdf.groups does not contain 0 entries) +function find_group_row(gdf::GroupedDataFrame) + rows = zeros(Int, length(gdf)) + isempty(rows) && return rows + + filled = 0 + i = 1 + while filled < length(gdf) + group = gdf.groups[i] + if rows[group] == 0 + rows[group] = i + filled += 1 + end + i += 1 end - return ref + return rows # return row index of first occurence of each group in gdf end function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, @@ -374,14 +385,14 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, valuecol::AbstractVector, g_rowkey::GroupedDataFrame, renamecols::Function, allowmissing::Bool, allowduplicates::Bool) - rowref = getrefs(g_rowkey) - df1 = df[g_rowkey.idx[g_rowkey.starts], g_rowkey.cols] + rowref = g_rowkey.groups + df1 = df[find_group_row(g_rowkey), g_rowkey.cols] Nrow = length(g_rowkey) @assert groupcols(g_colkey) == _names(df)[colkey:colkey] - colref = getrefs(g_colkey) + colref = g_colkey.groups Ncol = length(g_colkey) - colref_map = df[g_colkey.starts, colkey] + colref_map = df[find_group_row(g_colkey), colkey] if any(ismissing, colref_map) && !allowmissing throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." * From 94896f936210f3374744bc5b58a42eb4ab8c0af8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 28 Oct 2020 00:27:31 +0100 Subject: [PATCH 14/20] start implementing tests --- NEWS.md | 2 +- src/abstractdataframe/reshape.jl | 4 ++-- test/reshape.jl | 31 +++++++++++++++---------------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 20fdbf398f..d8cb3931bd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -41,7 +41,7 @@ * in `describe` the specification of custom aggregation is now `function => name`; old `name => function` order is now deprecated ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) -* `unstack` now does produce rows and columns in their order of appereance in the source +* `unstack` now does produce rows and columns in the order which `groupby` produces and has two new keyword arguments `allowmissing` and `allowduplicates` ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494)) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index ece5be7ebf..f1b246c634 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -207,8 +207,8 @@ end Unstack data frame `df`, i.e. convert it from long to wide format. -Row keys and values from value column will be ordered in the order of their -appearance in the respective vectors. +Row keys and values from value column will be ordered in the order produced +by `groupby`. 
# Positional arguments - `df` : the AbstractDataFrame to be unstacked diff --git a/test/reshape.jl b/test/reshape.jl index 4a2cb6ad1b..7c67fca029 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -52,10 +52,9 @@ const ≅ = isequal df2 = unstack(df, :Fish, :Key, :Value) #Unstack without specifying a row column df3 = unstack(df, :Key, :Value) - #The expected output, XXX level should be dropped as it has no rows with this key - df4 = DataFrame(Fish = ["Batman", "Bob"], - Color = ["Grey", "Red"], - Mass = ["18 g", "12 g"]) + df4 = DataFrame(Fish = ["Bob", "Batman"], + Mass = ["12 g", "18 g"], + Color = ["Red", "Grey"]) @test df2 ≅ df4 @test typeof(df2[!, :Fish]) <: Vector{String} # first column stays as CategoricalArray in df3 @@ -66,7 +65,7 @@ const ≅ = isequal df2 = unstack(df, :Fish, :Key, :Value) #This changes the expected result allowmissing!(df4, :Mass) - df4[2, :Mass] = missing + df4[1, :Mass] = missing @test df2 ≅ df4 df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"], @@ -74,9 +73,9 @@ const ≅ = isequal Value = ["12 g", "Red", "18 g", "Grey"]) df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) - df4 = DataFrame(Fish = ["Batman", "Bob"], - _COLOR_ = ["Grey", "Red"], - _MASS_ = ["18 g", "12 g"]) + df4 = DataFrame(Fish = ["Bob", "Batman"], + _MASS_ = ["12 g", "18 g"], + _COLOR_ = ["Red", "Grey"]) @test df2 == df4 @test df3 == df4 @@ -90,10 +89,10 @@ const ≅ = isequal # test missing value in grouping variable mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4) - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf + @test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf + @test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf # test more than one grouping column wide = DataFrame(id = 1:12, @@ -169,8 +168,8 @@ end @test_throws ArgumentError unstack(df, :variable, :value) @test_throws ArgumentError unstack(df, :variable, :value, allowmissing=true) udf = unstack(df, :variable, :value, allowmissing=true, renamecols=x -> coalesce(x, "MISSING")) - @test propertynames(udf) == [:id, :a, :b, :missing, :MISSING] - @test udf[!, :missing] ≅ [missing, 9.0, missing] + @test propertynames(udf) == [:id, :a, :b, :MISSING, :missing] + @test udf[!, :missing] ≅ [missing, missing, 9.0] @test udf[!, :MISSING] ≅ [3.0, missing, missing] df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], @@ -181,8 +180,8 @@ end @test_throws ArgumentError unstack(df, 3, 4, allowmissing=true) udf = unstack(df, 3, 4, allowmissing=true, renamecols=x -> coalesce(x, "MISSING")) - @test propertynames(udf) == [:id, :id2, :a, :b, :missing, :MISSING] - @test udf[!, :missing] ≅ [missing, 9.0, missing] + @test propertynames(udf) == [:id, :id2, :a, :b, :MISSING, :missing] + @test udf[!, :missing] ≅ [missing, missing, 9.0] @test udf[!, :MISSING] ≅ [3.0, missing, missing] end From c80d2dfd541c7008c48facd5de995af7bdc36fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 28 Oct 2020 09:45:38 +0100 Subject: [PATCH 15/20] improve 
test coverage --- test/reshape.jl | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/test/reshape.jl b/test/reshape.jl index 7c67fca029..2be79b7ba6 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -498,12 +498,36 @@ end @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + # an exertice on current unstack invariants Random.seed!(1234) - for i in 1:3 + for i in 1:16 df = df[Random.shuffle(1:9), :] - @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) - @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == + @test unstack(df, :id, :var, :val)[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] == + DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) + @test unstack(df, :var, :val)[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] == DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + + df2 = copy(df) + df2.id = PooledArray(df.id) + df2.var = PooledArray(df.var) + @test unstack(df2, :id, :var, :val)[sortperm(df2.id.pool), [1; 1 .+ sortperm(df2.var.pool)]] == + DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) + @test unstack(df2, :var, :val)[sortperm(df2.id.pool), [1:2; 2 .+ sortperm(df2.var.pool)]] == + DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + + df2 = categorical(df, 1:3) + @test unstack(df2, :id, :var, :val) == + DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) == + DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + levels!(df2.id, [10, 2, 11, 3, 1, 12]) + levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z']) + @test unstack(df2, :id, :var, :val) == + DataFrame(id=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :] + @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) == + DataFrame(id=1:3, id2=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :] end df = DataFrame(id=repeat(1:3, inner=3), From 8f235480f9fdf4134fa2680a6dbe552b17d3847a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 29 Oct 2020 23:40:56 +0100 Subject: [PATCH 16/20] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/reshape.jl | 5 +++-- test/reshape.jl | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index f1b246c634..3c612b6bef 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -369,15 +369,16 @@ function find_group_row(gdf::GroupedDataFrame) filled = 0 i = 1 + groups = gdf.groups while filled < length(gdf) - group = gdf.groups[i] + group = groups[i] if rows[group] == 0 rows[group] = i filled += 1 end i += 1 end - return rows # return row index of first occurence of each group in gdf + return rows # return row index of first occurrence of each group in gdf end function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, diff --git a/test/reshape.jl b/test/reshape.jl index 2be79b7ba6..95dd719f49 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -498,7 +498,7 @@ end @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) - # an exertice on current unstack invariants + # an exercise on current unstack invariants 
Random.seed!(1234) for i in 1:16 df = df[Random.shuffle(1:9), :] From 58cc578ec41cbf05aa092c890b6e88f532494db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 29 Oct 2020 23:46:07 +0100 Subject: [PATCH 17/20] improve docstring --- src/abstractdataframe/reshape.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 3c612b6bef..96303872a4 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -207,8 +207,10 @@ end Unstack data frame `df`, i.e. convert it from long to wide format. -Row keys and values from value column will be ordered in the order produced -by `groupby`. +Row and column keys will be ordered in the order of their first appearance except +when they are stored in an `AbstractVector` which supports `DataAPI.refpool` +(two most common cases are `CategoricalVector` and `PooledVector`), +in which case the odrer follows the order of values in this pool. # Positional arguments - `df` : the AbstractDataFrame to be unstacked From 35b1f0a9fd3b91c1149393cfc2ca3cecefb811a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 30 Oct 2020 13:00:20 +0100 Subject: [PATCH 18/20] make unstack always use order of first appereance --- src/abstractdataframe/reshape.jl | 32 +++++++++++++++++------- test/reshape.jl | 43 ++++++++++++++++---------------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 96303872a4..e97fb005fa 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -15,7 +15,6 @@ If `view=true` then return a stacked view of a data frame (long format). The result is a view because the columns are special `AbstractVectors` that return views into the original data frame. - # Arguments - `df` : the AbstractDataFrame to be stacked - `measure_vars` : the columns to be stacked (the measurement variables), @@ -207,10 +206,7 @@ end Unstack data frame `df`, i.e. convert it from long to wide format. -Row and column keys will be ordered in the order of their first appearance except -when they are stored in an `AbstractVector` which supports `DataAPI.refpool` -(two most common cases are `CategoricalVector` and `PooledVector`), -in which case the odrer follows the order of values in this pool. +Row and column keys will be ordered in the order of their first appearance. 
# Positional arguments - `df` : the AbstractDataFrame to be unstacked @@ -380,7 +376,7 @@ function find_group_row(gdf::GroupedDataFrame) end i += 1 end - return rows # return row index of first occurrence of each group in gdf + return rows # return row index of first occurrence of each group in gdf.groups end function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, @@ -389,13 +385,14 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, renamecols::Function, allowmissing::Bool, allowduplicates::Bool) rowref = g_rowkey.groups - df1 = df[find_group_row(g_rowkey), g_rowkey.cols] + row_group_row_idxs = find_group_row(g_rowkey) Nrow = length(g_rowkey) @assert groupcols(g_colkey) == _names(df)[colkey:colkey] colref = g_colkey.groups Ncol = length(g_colkey) - colref_map = df[find_group_row(g_colkey), colkey] + col_group_row_idxs = find_group_row(g_colkey) + colref_map = df[col_group_row_idxs, colkey] if any(ismissing, colref_map) && !allowmissing throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." * @@ -415,10 +412,27 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, unstacked_val[col_id][row_id] = val mask_filled[row_id, col_id] = true end + # note that Symbol.(renamecols.(colref_map)) must produce unique column names # and names between df1 and df2 must be unique + df1 = df[row_group_row_idxs, g_rowkey.cols] df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false) - hcat(df1, df2, copycols=false) + + @assert length(col_group_row_idxs) == ncol(df2) + # avoid reordering when col_group_row_idxs was already ordered + if !issorted(col_group_row_idxs) + df2 = df2[!, sortperm(col_group_row_idxs)] + end + + res_df = hcat(df1, df2, copycols=false) + + @assert length(row_group_row_idxs) == nrow(res_df) + # avoid reordering when col_group_row_idxs was already ordered + if !issorted(row_group_row_idxs) + res_df = res_df[sortperm(row_group_row_idxs), :] + end + + return res_df end """ diff --git a/test/reshape.jl b/test/reshape.jl index 95dd719f49..70419250d7 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -16,10 +16,10 @@ const ≅ = isequal @test levels(df[!, 2]) == ["YYY", "Color", "Mass"] # make sure we did not mess df[!, 2] levels #Unstack without specifying a row column df3 = unstack(df, :Key, :Value) - #The expected output, XXX level should be dropped as it has no rows with this key + #The expected output is in odred of appereance df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"], - Color = Union{String, Missing}["Red", "Grey"], - Mass = Union{String, Missing}["12 g", "18 g"]) + Mass = Union{String, Missing}["12 g", "18 g"], + Color = Union{String, Missing}["Red", "Grey"]) @test df2 ≅ df4 @test typeof(df2[!, :Fish]) <: CategoricalVector{Union{String, Missing}} # first column stays as CategoricalArray in df3 @@ -39,8 +39,8 @@ const ≅ = isequal df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"], - _COLOR_ = Union{String, Missing}["Red", "Grey"], - _MASS_ = Union{String, Missing}["12 g", "18 g"]) + _MASS_ = Union{String, Missing}["12 g", "18 g"], + _COLOR_ = Union{String, Missing}["Red", "Grey"]) @test df2 == df4 @test df3 == df4 @@ -498,36 +498,35 @@ end @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) - # an exercise on current unstack 
invariants + # make sure we always use order of appereance Random.seed!(1234) for i in 1:16 df = df[Random.shuffle(1:9), :] - @test unstack(df, :id, :var, :val)[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] == + wide1 = unstack(df, :id, :var, :val) + wide2 = unstack(df, [:id, :id2], :var, :val) + wide3 = unstack(df, :var, :val) + @test wide1[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) - @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) - @test unstack(df, :var, :val)[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] == + @test wide2[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] == DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test wide2 == wide3 df2 = copy(df) df2.id = PooledArray(df.id) df2.var = PooledArray(df.var) - @test unstack(df2, :id, :var, :val)[sortperm(df2.id.pool), [1; 1 .+ sortperm(df2.var.pool)]] == - DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) - @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) - @test unstack(df2, :var, :val)[sortperm(df2.id.pool), [1:2; 2 .+ sortperm(df2.var.pool)]] == - DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df2, :id, :var, :val) == wide1 + @test unstack(df2, [:id, :id2], :var, :val) == wide2 + @test unstack(df2, :var, :val) == wide3 df2 = categorical(df, 1:3) - @test unstack(df2, :id, :var, :val) == - DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) - @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) == - DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df2, :id, :var, :val) == wide1 + @test unstack(df2, [:id, :id2], :var, :val) == wide2 + @test unstack(df2, :var, :val) == wide3 levels!(df2.id, [10, 2, 11, 3, 1, 12]) levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z']) - @test unstack(df2, :id, :var, :val) == - DataFrame(id=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :] - @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) == - DataFrame(id=1:3, id2=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :] + @test unstack(df2, :id, :var, :val) == wide1 + @test unstack(df2, [:id, :id2], :var, :val) == wide2 + @test unstack(df2, :var, :val) == wide3 end df = DataFrame(id=repeat(1:3, inner=3), From 8b6c14f54a545892b52417817181dbb95bed8cb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 30 Oct 2020 16:08:32 +0100 Subject: [PATCH 19/20] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- test/reshape.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/reshape.jl b/test/reshape.jl index 70419250d7..480bb2d069 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -16,7 +16,7 @@ const ≅ = isequal @test levels(df[!, 2]) == ["YYY", "Color", "Mass"] # make sure we did not mess df[!, 2] levels #Unstack without specifying a row column df3 = unstack(df, :Key, :Value) - #The expected output is in odred of appereance + # The expected output is in order of appearance df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"], Mass = Union{String, Missing}["12 g", "18 g"], Color = Union{String, Missing}["Red", "Grey"]) @@ -500,6 +500,7 @@ end # make sure we always use order of appereance Random.seed!(1234) + # Use a large value to test several orders of appearance for i in 1:16 df = df[Random.shuffle(1:9), :] wide1 = unstack(df, :id, :var, :val) From 44caaf511deb37d39b0c86418bd1ae1a19a3be0b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Oct 2020 18:17:54 +0100 Subject: [PATCH 20/20] minor documentation cleanup --- NEWS.md | 2 +- docs/src/man/reshaping_and_pivoting.md | 33 ++++++--- src/abstractdataframe/reshape.jl | 96 +++++++++++++------------- 3 files changed, 73 insertions(+), 58 deletions(-) diff --git a/NEWS.md b/NEWS.md index d8cb3931bd..43ac61d13d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -41,7 +41,7 @@ * in `describe` the specification of custom aggregation is now `function => name`; old `name => function` order is now deprecated ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) -* `unstack` now does produce rows and columns in the order which `groupby` produces +* `unstack` now produces row and column keys in the order of their first appearance and has two new keyword arguments `allowmissing` and `allowduplicates` ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494)) diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 7107f7184f..d75237bc1e 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -58,7 +58,9 @@ julia> last(d, 6) │ 6 │ Iris-virginica │ PetalWidth │ 1.8 │ ``` -The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given: +The second optional argument to `stack` indicates the columns to be stacked. +These are normally referred to as the measured variables. Column names can also +be given: ```jldoctest reshape julia> d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]); @@ -88,11 +90,18 @@ julia> last(d, 6) │ 6 │ Iris-virginica │ PetalWidth │ 1.8 │ ``` -Note that all columns can be of different types. Type promotion follows the rules of `vcat`. +Note that all columns can be of different types. Type promotion follows the +rules of `vcat`. -The stacked `DataFrame` that results includes all of the columns not specified to be stacked. These are repeated for each stacked column. These are normally refered to as identifier (id) columns. In addition to the id columns, two additional columns labeled `:variable` and `:values` contain the column identifier and the stacked columns. +The stacked `DataFrame` that results includes all of the columns not specified +to be stacked. These are repeated for each stacked column. These are normally +refered to as identifier (id) columns. In addition to the id columns, two +additional columns labeled `:variable` and `:values` contain the column +identifier and the stacked columns. -A third optional argument to `stack` represents the id columns that are repeated. This makes it easier to specify which variables you want included in the long format: +A third optional argument to `stack` represents the id columns that are +repeated. This makes it easier to specify which variables you want included in +the long format: ```jldoctest reshape julia> d = stack(iris, [:SepalLength, :SepalWidth], :Species); @@ -152,7 +161,9 @@ julia> last(d, 6) │ 6 │ Iris-virginica │ PetalWidth │ 1.8 │ ``` -`unstack` converts from a long format to a wide format. The default is requires specifying which columns are an id variable, column variable names, and column values: +`unstack` converts from a long format to a wide format. 
+The default is requires specifying which columns are an id variable, +column variable names, and column values: ```jldoctest reshape julia> iris.id = 1:size(iris, 1) @@ -267,7 +278,8 @@ julia> last(widedf, 6) │ 6 │ Iris-virginica │ 150 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ ``` -You can even skip passing the `:variable` and `:value` values as positional arguments, as they will be used by default, and write: +You can even skip passing the `:variable` and `:value` values as positional +arguments, as they will be used by default, and write: ```jldoctest reshape julia> widedf = unstack(longdf); @@ -296,7 +308,8 @@ julia> last(widedf, 6) │ 6 │ Iris-virginica │ 150 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ ``` -Passing `view=true` to `stack` returns a data frame whose columns are views into the original wide data frame. Here is an example: +Passing `view=true` to `stack` returns a data frame whose columns are views into +the original wide data frame. Here is an example: ```jldoctest reshape julia> d = stack(iris, view=true); @@ -337,7 +350,9 @@ This is provides a view of the original columns stacked together. Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. -None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example: +None of these reshaping functions perform any aggregation. To do aggregation, +use the split-apply-combine functions in combination with reshaping. Here is an +example: ```jldoctest reshape julia> using Statistics @@ -356,7 +371,7 @@ julia> first(d, 6) │ 5 │ Iris-setosa │ SepalLength │ 5.0 │ │ 6 │ Iris-setosa │ SepalLength │ 5.4 │ -julia> x = by(d, [:variable, :Species], :value => mean => :vsum); +julia> x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum); julia> first(x, 6) │ Row │ variable │ Species │ vsum │ diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index e97fb005fa..4d5dd80df8 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -242,90 +242,90 @@ julia> wide = DataFrame(id = 1:6, │ Row │ id │ a │ b │ c │ d │ │ │ Int64 │ Int64 │ Int64 │ Float64 │ Float64 │ ├─────┼───────┼───────┼───────┼───────────┼───────────┤ -│ 1 │ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ -│ 2 │ 2 │ 1 │ 1 │ -0.917794 │ 0.940007 │ -│ 3 │ 3 │ 2 │ 1 │ 0.309629 │ 0.820397 │ -│ 4 │ 4 │ 2 │ 2 │ 1.46677 │ -1.03457 │ -│ 5 │ 5 │ 3 │ 2 │ 1.04339 │ -0.770464 │ -│ 6 │ 6 │ 3 │ 2 │ -0.172475 │ -2.81039 │ +│ 1 │ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 2 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ 2 │ -0.270569 │ -1.07331 │ julia> long = stack(wide) 12×5 DataFrame │ Row │ id │ a │ b │ variable │ value │ │ │ Int64 │ Int64 │ Int64 │ String │ Float64 │ ├─────┼───────┼───────┼───────┼──────────┼───────────┤ -│ 1 │ 1 │ 1 │ 1 │ c │ 1.20649 │ -│ 2 │ 2 │ 1 │ 1 │ c │ -0.917794 │ -│ 3 │ 3 │ 2 │ 1 │ c │ 0.309629 │ -│ 4 │ 4 │ 2 │ 2 │ c │ 1.46677 │ -│ 5 │ 5 │ 3 │ 2 │ c │ 1.04339 │ -│ 6 │ 6 │ 3 │ 2 │ c │ -0.172475 │ -│ 7 │ 1 │ 1 │ 1 │ d │ -1.27628 │ -│ 8 │ 2 │ 1 │ 1 │ d │ 0.940007 │ -│ 9 │ 3 │ 2 │ 1 │ d │ 0.820397 │ -│ 10 │ 4 │ 2 │ 2 │ d │ -1.03457 │ -│ 11 │ 5 │ 3 │ 2 │ d │ -0.770464 │ -│ 12 │ 6 │ 3 │ 2 │ d │ -2.81039 │ +│ 1 │ 1 │ 1 │ 1 │ c │ -1.07327 │ +│ 2 │ 2 │ 1 │ 1 │ c │ -0.334919 │ +│ 3 │ 3 │ 2 │ 1 │ c │ 1.73213 │ +│ 4 │ 4 │ 2 │ 2 │ c │ 0.883706 │ +│ 5 │ 5 │ 3 │ 2 │ c │ 0.919183 │ 
+│ 6 │ 6 │ 3 │ 2 │ c │ -0.270569 │ +│ 7 │ 1 │ 1 │ 1 │ d │ -0.948501 │ +│ 8 │ 2 │ 1 │ 1 │ d │ 1.00158 │ +│ 9 │ 3 │ 2 │ 1 │ d │ -0.97692 │ +│ 10 │ 4 │ 2 │ 2 │ d │ -2.15281 │ +│ 11 │ 5 │ 3 │ 2 │ d │ -0.700637 │ +│ 12 │ 6 │ 3 │ 2 │ d │ -1.07331 │ julia> unstack(long) 6×5 DataFrame │ Row │ id │ a │ b │ c │ d │ │ │ Int64 │ Int64 │ Int64 │ Float64? │ Float64? │ ├─────┼───────┼───────┼───────┼───────────┼───────────┤ -│ 1 │ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ -│ 2 │ 2 │ 1 │ 1 │ -0.917794 │ 0.940007 │ -│ 3 │ 3 │ 2 │ 1 │ 0.309629 │ 0.820397 │ -│ 4 │ 4 │ 2 │ 2 │ 1.46677 │ -1.03457 │ -│ 5 │ 5 │ 3 │ 2 │ 1.04339 │ -0.770464 │ -│ 6 │ 6 │ 3 │ 2 │ -0.172475 │ -2.81039 │ +│ 1 │ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 2 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ 2 │ -0.270569 │ -1.07331 │ julia> unstack(long, :variable, :value) 6×5 DataFrame │ Row │ id │ a │ b │ c │ d │ │ │ Int64 │ Int64 │ Int64 │ Float64? │ Float64? │ ├─────┼───────┼───────┼───────┼───────────┼───────────┤ -│ 1 │ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ -│ 2 │ 2 │ 1 │ 1 │ -0.917794 │ 0.940007 │ -│ 3 │ 3 │ 2 │ 1 │ 0.309629 │ 0.820397 │ -│ 4 │ 4 │ 2 │ 2 │ 1.46677 │ -1.03457 │ -│ 5 │ 5 │ 3 │ 2 │ 1.04339 │ -0.770464 │ -│ 6 │ 6 │ 3 │ 2 │ -0.172475 │ -2.81039 │ +│ 1 │ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 2 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ 2 │ -0.270569 │ -1.07331 │ julia> unstack(long, :id, :variable, :value) 6×3 DataFrame │ Row │ id │ c │ d │ │ │ Int64 │ Float64? │ Float64? │ ├─────┼───────┼───────────┼───────────┤ -│ 1 │ 1 │ 1.20649 │ -1.27628 │ -│ 2 │ 2 │ -0.917794 │ 0.940007 │ -│ 3 │ 3 │ 0.309629 │ 0.820397 │ -│ 4 │ 4 │ 1.46677 │ -1.03457 │ -│ 5 │ 5 │ 1.04339 │ -0.770464 │ -│ 6 │ 6 │ -0.172475 │ -2.81039 │ +│ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ -0.270569 │ -1.07331 │ julia> unstack(long, [:id, :a], :variable, :value) 6×4 DataFrame │ Row │ id │ a │ c │ d │ │ │ Int64 │ Int64 │ Float64? │ Float64? │ ├─────┼───────┼───────┼───────────┼───────────┤ -│ 1 │ 1 │ 1 │ 1.20649 │ -1.27628 │ -│ 2 │ 2 │ 1 │ -0.917794 │ 0.940007 │ -│ 3 │ 3 │ 2 │ 0.309629 │ 0.820397 │ -│ 4 │ 4 │ 2 │ 1.46677 │ -1.03457 │ -│ 5 │ 5 │ 3 │ 1.04339 │ -0.770464 │ -│ 6 │ 6 │ 3 │ -0.172475 │ -2.81039 │ +│ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ -0.270569 │ -1.07331 │ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) 6×3 DataFrame │ Row │ id │ _c │ _d │ │ │ Int64 │ Float64? │ Float64? │ ├─────┼───────┼───────────┼───────────┤ -│ 1 │ 1 │ 1.20649 │ -1.27628 │ -│ 2 │ 2 │ -0.917794 │ 0.940007 │ -│ 3 │ 3 │ 0.309629 │ 0.820397 │ -│ 4 │ 4 │ 1.46677 │ -1.03457 │ -│ 5 │ 5 │ 1.04339 │ -0.770464 │ -│ 6 │ 6 │ -0.172475 │ -2.81039 │ +│ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ -0.270569 │ -1.07331 │ ``` Note that there are some differences between the widened results above. """
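As a rough usage sketch of the two new keyword arguments, the snippet below mirrors the pattern used in the tests above; the data frame itself is illustrative and not taken from the patch:

```julia
using DataFrames

# Long-format data with a duplicated (id, var) pair and a missing variable name.
df = DataFrame(id  = [1, 1, 1, 2],
               var = ["a", "a", missing, "b"],
               val = [1.0, 2.0, 3.0, 4.0])

# unstack(df, :id, :var, :val)   # throws ArgumentError: duplicate entry / missing variable

# With both new keyword arguments the call succeeds: for the duplicated pair the
# value from the later row wins, and the missing variable name is mapped to a
# column name through `renamecols`, as in the tests above.
wide = unstack(df, :id, :var, :val,
               allowmissing=true, allowduplicates=true,
               renamecols=x -> coalesce(x, "MISSING"))
```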
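A similarly illustrative sketch of the ordering rule stated in the final docstring ("order of their first appearance"), again with made-up data:

```julia
using DataFrames

long = DataFrame(id  = [2, 2, 1, 1],
                 var = ["b", "a", "b", "a"],
                 val = 1:4)

unstack(long, :id, :var, :val)
# Rows come out as id = 2 then id = 1, and the value columns as :b then :a,
# i.e. the order in which each key is first seen in `long`, not sorted order.
```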
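Finally, an approximate illustration of what the `find_group_row` helper computes. This is a simplified version without the early exit, and it reads the internal `gdf.groups` field in the same way the patch does:

```julia
using DataFrames

df = DataFrame(k = ["x", "y", "x", "z"])
gdf = groupby(df, :k)

# `gdf.groups` assigns every row of the parent data frame to a group number;
# the helper scans it once and records the first row index at which each
# group number occurs.
first_rows = zeros(Int, length(gdf))
for (i, g) in enumerate(gdf.groups)
    first_rows[g] == 0 && (first_rows[g] = i)
end
first_rows  # one parent-row index per group, indexed by group number
```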