diff --git a/NEWS.md b/NEWS.md index fff1f8c405..43ac61d13d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -41,6 +41,9 @@ * in `describe` the specification of custom aggregation is now `function => name`; old `name => function` order is now deprecated ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) +* `unstack` now produces row and column keys in the order of their first appearance + and has two new keyword arguments `allowmissing` and `allowduplicates` + ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494)) ## New functionalities diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 7107f7184f..d75237bc1e 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -58,7 +58,9 @@ julia> last(d, 6) │ 6 │ Iris-virginica │ PetalWidth │ 1.8 │ ``` -The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given: +The second optional argument to `stack` indicates the columns to be stacked. +These are normally referred to as the measured variables. Column names can also +be given: ```jldoctest reshape julia> d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]); @@ -88,11 +90,18 @@ julia> last(d, 6) │ 6 │ Iris-virginica │ PetalWidth │ 1.8 │ ``` -Note that all columns can be of different types. Type promotion follows the rules of `vcat`. +Note that all columns can be of different types. Type promotion follows the +rules of `vcat`. -The stacked `DataFrame` that results includes all of the columns not specified to be stacked. These are repeated for each stacked column. These are normally refered to as identifier (id) columns. In addition to the id columns, two additional columns labeled `:variable` and `:values` contain the column identifier and the stacked columns. 
+The stacked `DataFrame` that results includes all of the columns not specified +to be stacked. These are repeated for each stacked column. These are normally +referred to as identifier (id) columns. In addition to the id columns, two +additional columns labeled `:variable` and `:value` contain the column +identifier and the stacked columns. -A third optional argument to `stack` represents the id columns that are repeated. This makes it easier to specify which variables you want included in the long format: +A third optional argument to `stack` represents the id columns that are +repeated. This makes it easier to specify which variables you want included in +the long format: ```jldoctest reshape julia> d = stack(iris, [:SepalLength, :SepalWidth], :Species); @@ -152,7 +161,9 @@ julia> last(d, 6) │ 6 │ Iris-virginica │ PetalWidth │ 1.8 │ ``` -`unstack` converts from a long format to a wide format. The default is requires specifying which columns are an id variable, column variable names, and column values: +`unstack` converts from a long format to a wide format. +By default it requires specifying which columns are id variables, +column variable names, and column values: ```jldoctest reshape julia> iris.id = 1:size(iris, 1) @@ -267,7 +278,8 @@ julia> last(widedf, 6) │ 6 │ Iris-virginica │ 150 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ ``` -You can even skip passing the `:variable` and `:value` values as positional arguments, as they will be used by default, and write: +You can even skip passing the `:variable` and `:value` values as positional +arguments, as they will be used by default, and write: ```jldoctest reshape julia> widedf = unstack(longdf); @@ -296,7 +308,8 @@ julia> last(widedf, 6) │ 6 │ Iris-virginica │ 150 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ ``` -Passing `view=true` to `stack` returns a data frame whose columns are views into the original wide data frame. 
Here is an example: +Passing `view=true` to `stack` returns a data frame whose columns are views into +the original wide data frame. Here is an example: ```jldoctest reshape julia> d = stack(iris, view=true); @@ -337,7 +350,9 @@ This is provides a view of the original columns stacked together. Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. -None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example: +None of these reshaping functions perform any aggregation. To do aggregation, +use the split-apply-combine functions in combination with reshaping. Here is an +example: ```jldoctest reshape julia> using Statistics @@ -356,7 +371,7 @@ julia> first(d, 6) │ 5 │ Iris-setosa │ SepalLength │ 5.0 │ │ 6 │ Iris-setosa │ SepalLength │ 5.4 │ -julia> x = by(d, [:variable, :Species], :value => mean => :vsum); +julia> x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum); julia> first(x, 6) │ Row │ variable │ Species │ vsum │ diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 4f5cca7d1d..4d5dd80df8 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -15,7 +15,6 @@ If `view=true` then return a stacked view of a data frame (long format). The result is a view because the columns are special `AbstractVectors` that return views into the original data frame. - # Arguments - `df` : the AbstractDataFrame to be stacked - `measure_vars` : the columns to be stacked (the measurement variables), @@ -43,16 +42,93 @@ that return views into the original data frame. 
# Examples ```julia -d1 = DataFrame(a = repeat([1:3;], inner = [4]), - b = repeat([1:4;], inner = [3]), - c = randn(12), - d = randn(12), - e = map(string, 'a':'l')) - -d1s = stack(d1, [:c, :d]) -d1s2 = stack(d1, [:c, :d], [:a]) -d1m = stack(d1, Not([:a, :b, :e])) -d1s_name = stack(d1, Not([:a, :b, :e]), variable_name=:somemeasure) +julia> df = DataFrame(a = repeat([1:3;], inner = [2]), + b = repeat([1:2;], inner = [3]), + c = randn(6), + d = randn(), + e = map(string, 'a':'f')) +6×5 DataFrame +│ Row │ a │ b │ c │ d │ e │ +│ │ Int64 │ Int64 │ Float64 │ Float64 │ String │ +├─────┼───────┼───────┼──────────┼──────────┼────────┤ +│ 1 │ 1 │ 1 │ -1.1078 │ 0.680175 │ a │ +│ 2 │ 1 │ 1 │ 0.078634 │ 0.680175 │ b │ +│ 3 │ 2 │ 1 │ -1.47615 │ 0.680175 │ c │ +│ 4 │ 2 │ 2 │ 0.826434 │ 0.680175 │ d │ +│ 5 │ 3 │ 2 │ 0.597258 │ 0.680175 │ e │ +│ 6 │ 3 │ 2 │ 1.49645 │ 0.680175 │ f │ + +julia> stack(df, [:c, :d]) +12×5 DataFrame +│ Row │ a │ b │ e │ variable │ value │ +│ │ Int64 │ Int64 │ String │ String │ Float64 │ +├─────┼───────┼───────┼────────┼──────────┼──────────┤ +│ 1 │ 1 │ 1 │ a │ c │ -1.1078 │ +│ 2 │ 1 │ 1 │ b │ c │ 0.078634 │ +│ 3 │ 2 │ 1 │ c │ c │ -1.47615 │ +│ 4 │ 2 │ 2 │ d │ c │ 0.826434 │ +│ 5 │ 3 │ 2 │ e │ c │ 0.597258 │ +│ 6 │ 3 │ 2 │ f │ c │ 1.49645 │ +│ 7 │ 1 │ 1 │ a │ d │ 0.680175 │ +│ 8 │ 1 │ 1 │ b │ d │ 0.680175 │ +│ 9 │ 2 │ 1 │ c │ d │ 0.680175 │ +│ 10 │ 2 │ 2 │ d │ d │ 0.680175 │ +│ 11 │ 3 │ 2 │ e │ d │ 0.680175 │ +│ 12 │ 3 │ 2 │ f │ d │ 0.680175 │ + +julia> stack(df, [:c, :d], [:a]) +12×3 DataFrame +│ Row │ a │ variable │ value │ +│ │ Int64 │ String │ Float64 │ +├─────┼───────┼──────────┼──────────┤ +│ 1 │ 1 │ c │ -1.1078 │ +│ 2 │ 1 │ c │ 0.078634 │ +│ 3 │ 2 │ c │ -1.47615 │ +│ 4 │ 2 │ c │ 0.826434 │ +│ 5 │ 3 │ c │ 0.597258 │ +│ 6 │ 3 │ c │ 1.49645 │ +│ 7 │ 1 │ d │ 0.680175 │ +│ 8 │ 1 │ d │ 0.680175 │ +│ 9 │ 2 │ d │ 0.680175 │ +│ 10 │ 2 │ d │ 0.680175 │ +│ 11 │ 3 │ d │ 0.680175 │ +│ 12 │ 3 │ d │ 0.680175 │ + +julia> stack(df, Not([:a, :b, :e])) +12×5 
DataFrame +│ Row │ a │ b │ e │ variable │ value │ +│ │ Int64 │ Int64 │ String │ String │ Float64 │ +├─────┼───────┼───────┼────────┼──────────┼──────────┤ +│ 1 │ 1 │ 1 │ a │ c │ -1.1078 │ +│ 2 │ 1 │ 1 │ b │ c │ 0.078634 │ +│ 3 │ 2 │ 1 │ c │ c │ -1.47615 │ +│ 4 │ 2 │ 2 │ d │ c │ 0.826434 │ +│ 5 │ 3 │ 2 │ e │ c │ 0.597258 │ +│ 6 │ 3 │ 2 │ f │ c │ 1.49645 │ +│ 7 │ 1 │ 1 │ a │ d │ 0.680175 │ +│ 8 │ 1 │ 1 │ b │ d │ 0.680175 │ +│ 9 │ 2 │ 1 │ c │ d │ 0.680175 │ +│ 10 │ 2 │ 2 │ d │ d │ 0.680175 │ +│ 11 │ 3 │ 2 │ e │ d │ 0.680175 │ +│ 12 │ 3 │ 2 │ f │ d │ 0.680175 │ + +julia> stack(df, Not([:a, :b, :e]), variable_name=:somemeasure) +12×5 DataFrame +│ Row │ a │ b │ e │ somemeasure │ value │ +│ │ Int64 │ Int64 │ String │ String │ Float64 │ +├─────┼───────┼───────┼────────┼─────────────┼──────────┤ +│ 1 │ 1 │ 1 │ a │ c │ -1.1078 │ +│ 2 │ 1 │ 1 │ b │ c │ 0.078634 │ +│ 3 │ 2 │ 1 │ c │ c │ -1.47615 │ +│ 4 │ 2 │ 2 │ d │ c │ 0.826434 │ +│ 5 │ 3 │ 2 │ e │ c │ 0.597258 │ +│ 6 │ 3 │ 2 │ f │ c │ 1.49645 │ +│ 7 │ 1 │ 1 │ a │ d │ 0.680175 │ +│ 8 │ 1 │ 1 │ b │ d │ 0.680175 │ +│ 9 │ 2 │ 1 │ c │ d │ 0.680175 │ +│ 10 │ 2 │ 2 │ d │ d │ 0.680175 │ +│ 11 │ 3 │ 2 │ e │ d │ 0.680175 │ +│ 12 │ 3 │ 2 │ f │ d │ 0.680175 │ ``` """ function stack(df::AbstractDataFrame, @@ -121,19 +197,18 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int}, end """ - unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity) - unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity) - unstack(df::AbstractDataFrame; renamecols::Function=identity) + unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) + unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) + unstack(df::AbstractDataFrame; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) 
Unstack data frame `df`, i.e. convert it from long to wide format. -If `colkey` contains `missing` values then they will be skipped and a warning -will be printed. - -If combination of `rowkeys` and `colkey` contains duplicate entries then last -`value` will be retained and a warning will be printed. +Row and column keys will be ordered in the order of their first appearance. -# Arguments +# Positional arguments - `df` : the AbstractDataFrame to be unstacked - `rowkeys` : the columns with a unique key for each row, if not given, find a key by grouping on anything not a `colkey` or `value`. @@ -141,150 +216,223 @@ If combination of `rowkeys` and `colkey` contains duplicate entries then last - `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide format, defaults to `:variable` - `value` : the value column ($COLUMNINDEX_STR), defaults to `:value` -- `renamecols` : a function called on each unique value in `colkey` which must - return the name of the column to be created (typically as a string - or a `Symbol`). Duplicate names are not allowed. +# Keyword arguments + +- `renamecols`: a function called on each unique value in `colkey`; it must return + the name of the column to be created (typically as a string or a `Symbol`). + Duplicates in resulting names when converted to `Symbol` are not allowed. + By default no transformation is performed. +- `allowmissing`: if `false` (the default) then an error will be thrown if `colkey` + contains `missing` values; if `true` then a column referring to `missing` value + will be created. +- `allowduplicates`: if `false` (the default) then an error will be thrown + if combination of `rowkeys` and `colkey` contains duplicate entries; if `true` + then the last encountered `value` will be retained. 
# Examples + ```julia -wide = DataFrame(id = 1:12, - a = repeat([1:3;], inner = [4]), - b = repeat([1:4;], inner = [3]), - c = randn(12), - d = randn(12)) - -long = stack(wide) -wide0 = unstack(long) -wide1 = unstack(long, :variable, :value) -wide2 = unstack(long, :id, :variable, :value) -wide3 = unstack(long, [:id, :a], :variable, :value) -wide4 = unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) +julia> wide = DataFrame(id = 1:6, + a = repeat([1:3;], inner = [2]), + b = repeat([1:2;], inner = [3]), + c = randn(6), + d = randn(6)) +6×5 DataFrame +│ Row │ id │ a │ b │ c │ d │ +│ │ Int64 │ Int64 │ Int64 │ Float64 │ Float64 │ +├─────┼───────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 2 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ 2 │ -0.270569 │ -1.07331 │ + +julia> long = stack(wide) +12×5 DataFrame +│ Row │ id │ a │ b │ variable │ value │ +│ │ Int64 │ Int64 │ Int64 │ String │ Float64 │ +├─────┼───────┼───────┼───────┼──────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ c │ -1.07327 │ +│ 2 │ 2 │ 1 │ 1 │ c │ -0.334919 │ +│ 3 │ 3 │ 2 │ 1 │ c │ 1.73213 │ +│ 4 │ 4 │ 2 │ 2 │ c │ 0.883706 │ +│ 5 │ 5 │ 3 │ 2 │ c │ 0.919183 │ +│ 6 │ 6 │ 3 │ 2 │ c │ -0.270569 │ +│ 7 │ 1 │ 1 │ 1 │ d │ -0.948501 │ +│ 8 │ 2 │ 1 │ 1 │ d │ 1.00158 │ +│ 9 │ 3 │ 2 │ 1 │ d │ -0.97692 │ +│ 10 │ 4 │ 2 │ 2 │ d │ -2.15281 │ +│ 11 │ 5 │ 3 │ 2 │ d │ -0.700637 │ +│ 12 │ 6 │ 3 │ 2 │ d │ -1.07331 │ + +julia> unstack(long) +6×5 DataFrame +│ Row │ id │ a │ b │ c │ d │ +│ │ Int64 │ Int64 │ Int64 │ Float64? │ Float64? 
│ +├─────┼───────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 2 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ 2 │ -0.270569 │ -1.07331 │ + +julia> unstack(long, :variable, :value) +6×5 DataFrame +│ Row │ id │ a │ b │ c │ d │ +│ │ Int64 │ Int64 │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 2 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ 2 │ -0.270569 │ -1.07331 │ + +julia> unstack(long, :id, :variable, :value) +6×3 DataFrame +│ Row │ id │ c │ d │ +│ │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ -0.270569 │ -1.07331 │ + +julia> unstack(long, [:id, :a], :variable, :value) +6×4 DataFrame +│ Row │ id │ a │ c │ d │ +│ │ Int64 │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ 1 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 2 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 2 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 3 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ 3 │ -0.270569 │ -1.07331 │ + +julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) +6×3 DataFrame +│ Row │ id │ _c │ _d │ +│ │ Int64 │ Float64? │ Float64? │ +├─────┼───────┼───────────┼───────────┤ +│ 1 │ 1 │ -1.07327 │ -0.948501 │ +│ 2 │ 2 │ -0.334919 │ 1.00158 │ +│ 3 │ 3 │ 1.73213 │ -0.97692 │ +│ 4 │ 4 │ 0.883706 │ -2.15281 │ +│ 5 │ 5 │ 0.919183 │ -0.700637 │ +│ 6 │ 6 │ -0.270569 │ -1.07331 │ ``` Note that there are some differences between the widened results above. 
""" -function unstack(df::AbstractDataFrame, rowkey::ColumnIndex, colkey::ColumnIndex, - value::ColumnIndex; renamecols::Function=identity) - refkeycol = categorical(df[!, rowkey]) - droplevels!(refkeycol) - keycol = categorical(df[!, colkey]) - droplevels!(keycol) - valuecol = df[!, value] - return _unstack(df, index(df)[rowkey], index(df)[colkey], - keycol, valuecol, refkeycol, renamecols) -end - -function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, - keycol::CategoricalVector, valuecol::AbstractVector, - refkeycol::CategoricalVector, renamecols::Function) - Nrow = length(refkeycol.pool) - Ncol = length(keycol.pool) - unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] - hadmissing = false # have we encountered missing in refkeycol - mask_filled = falses(Nrow+1, Ncol) # has a given [row,col] entry been filled? - warned_dup = false # have we already printed duplicate entries warning? - warned_missing = false # have we already printed missing in keycol warning? - for k in 1:nrow(df) - kref = keycol.refs[k] - if kref <= 0 # we have found missing in colkey - if !warned_missing - @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. 
Skipping.") - warned_missing = true - end - continue # skip processing it - end - refkref = refkeycol.refs[k] - if refkref <= 0 # we have found missing in rowkey - if !hadmissing # if it is the first time we have to add a new row - hadmissing = true - # we use the fact that missing is greater than anything - for i in eachindex(unstacked_val) - push!(unstacked_val[i], missing) - end - end - i = length(unstacked_val[1]) - else - i = refkref - end - if !warned_dup && mask_filled[i, kref] - @warn("Duplicate entries in unstack at row $k for key "* - "$(refkeycol[k]) and variable $(keycol[k]).") - warned_dup = true - end - unstacked_val[kref][i] = valuecol[k] - mask_filled[i, kref] = true - end - levs = levels(refkeycol) - # we have to handle a case with missings in refkeycol as levs will skip missing - col = similar(df[!, rowkey], length(levs) + hadmissing) - copyto!(col, levs) - hadmissing && (col[end] = missing) - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(levels(keycol))), copycols=false) - return insertcols!(df2, 1, _names(df)[rowkey] => col) -end - function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, - value::ColumnIndex; renamecols::Function=identity) - rowkey_ints = index(df)[rowkeys] + value::ColumnIndex; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) + rowkey_ints = vcat(index(df)[rowkeys]) @assert rowkey_ints isa AbstractVector{Int} length(rowkey_ints) == 0 && throw(ArgumentError("No key column found")) - length(rowkey_ints) == 1 && return unstack(df, rowkey_ints[1], colkey, value, - renamecols=renamecols) - g = groupby(df, rowkey_ints, sort=true) - keycol = categorical(df[!, colkey]) - droplevels!(keycol) + g_rowkey = groupby(df, rowkey_ints) + g_colkey = groupby(df, colkey) valuecol = df[!, value] - return _unstack(df, rowkey_ints, index(df)[colkey], keycol, valuecol, g, renamecols) + return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey, + valuecol, g_rowkey, renamecols, allowmissing, 
allowduplicates) end function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex; - renamecols::Function=identity) + renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) colkey_int = index(df)[colkey] value_int = index(df)[value] return unstack(df, Not(colkey_int, value_int), colkey_int, value_int, - renamecols=renamecols) + renamecols=renamecols, allowmissing=allowmissing, + allowduplicates=allowduplicates) end -unstack(df::AbstractDataFrame; renamecols::Function=identity) = - unstack(df, :variable, :value, renamecols=renamecols) +unstack(df::AbstractDataFrame; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false) = + unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, + allowduplicates=allowduplicates) + +# we take into account the fact that idx, starts and ends are computed lazily +# so we rather directly reference the gdf.groups +# this function is tailor made for unstack so it does assume that no groups were +# dropped (i.e. 
gdf.groups does not contain 0 entries) +function find_group_row(gdf::GroupedDataFrame) + rows = zeros(Int, length(gdf)) + isempty(rows) && return rows + + filled = 0 + i = 1 + groups = gdf.groups + while filled < length(gdf) + group = groups[i] + if rows[group] == 0 + rows[group] = i + filled += 1 + end + i += 1 + end + return rows # return row index of first occurrence of each group in gdf.groups +end function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, - colkey::Int, keycol::CategoricalVector, - valuecol::AbstractVector, g::GroupedDataFrame, - renamecols::Function) - idx, starts, ends = g.idx, g.starts, g.ends - groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)] - rowkey = zeros(Int, size(df, 1)) - for i in 1:length(groupidxs) - rowkey[groupidxs[i]] .= i + colkey::Int, g_colkey::GroupedDataFrame, + valuecol::AbstractVector, g_rowkey::GroupedDataFrame, + renamecols::Function, + allowmissing::Bool, allowduplicates::Bool) + rowref = g_rowkey.groups + row_group_row_idxs = find_group_row(g_rowkey) + Nrow = length(g_rowkey) + + @assert groupcols(g_colkey) == _names(df)[colkey:colkey] + colref = g_colkey.groups + Ncol = length(g_colkey) + col_group_row_idxs = find_group_row(g_colkey) + colref_map = df[col_group_row_idxs, colkey] + + if any(ismissing, colref_map) && !allowmissing + throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." * + " Pass `allowmissing=true` to skip missings.")) end - df1 = df[idx[starts], g.cols] - Nrow = length(g) - Ncol = length(levels(keycol)) + unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol] mask_filled = falses(Nrow, Ncol) - warned_dup = false - warned_missing = false - for k in 1:nrow(df) - kref = keycol.refs[k] - if kref <= 0 - if !warned_missing - @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. 
Skipping.") - warned_missing = true - end - continue - end - i = rowkey[k] - if !warned_dup && mask_filled[i, kref] - @warn("Duplicate entries in unstack at row $k for key "* - "$(tuple((df[k,s] for s in rowkeys)...)) and variable $(keycol[k]).") - warned_dup = true + + @assert length(rowref) == length(colref) == length(valuecol) + for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) + if !allowduplicates && mask_filled[row_id, col_id] + throw(ArgumentError("Duplicate entries in unstack at row $k for key "* + "$(tuple((df[k, s] for s in rowkeys)...)) and variable $(colref_map[col_id]). " * + "Pass allowduplicates=true to allow them.")) end - unstacked_val[kref][i] = valuecol[k] - mask_filled[i, kref] = true + unstacked_val[col_id][row_id] = val + mask_filled[row_id, col_id] = true + end + + # note that Symbol.(renamecols.(colref_map)) must produce unique column names + # and names between df1 and df2 must be unique + df1 = df[row_group_row_idxs, g_rowkey.cols] + df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false) + + @assert length(col_group_row_idxs) == ncol(df2) + # avoid reordering when col_group_row_idxs was already ordered + if !issorted(col_group_row_idxs) + df2 = df2[!, sortperm(col_group_row_idxs)] + end + + res_df = hcat(df1, df2, copycols=false) + + @assert length(row_group_row_idxs) == nrow(res_df) + # avoid reordering when row_group_row_idxs was already ordered + if !issorted(row_group_row_idxs) + res_df = res_df[sortperm(row_group_row_idxs), :] end - df2 = DataFrame(unstacked_val, Symbol.(renamecols.(levels(keycol))), copycols=false) - hcat(df1, df2, copycols=false) + + return res_df end """ @@ -336,10 +484,6 @@ Base.eltype(v::Type{StackedVector{T}}) where {T} = T Base.similar(v::StackedVector, T::Type, dims::Union{Integer, AbstractUnitRange}...) = similar(v.components[1], T, dims...) 
-CategoricalArrays.CategoricalArray(v::StackedVector) = - CategoricalArray(v[:]) # could be more efficient - - """ RepeatedVector{T} <: AbstractVector{T} @@ -376,9 +520,6 @@ struct RepeatedVector{T} <: AbstractVector{T} end Base.parent(v::RepeatedVector) = v.parent -DataAPI.levels(v::RepeatedVector) = levels(parent(v)) -CategoricalArrays.isordered(v::RepeatedVector{<:Union{CategoricalValue, Missing}}) = - isordered(parent(v)) function Base.getindex(v::RepeatedVector, i::Int) N = length(parent(v)) @@ -394,13 +535,6 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(parent(v)), v.inner, v. Base.similar(v::RepeatedVector, T::Type, dims::Dims) = similar(parent(v), T, dims) Base.unique(v::RepeatedVector) = unique(parent(v)) -function CategoricalArrays.CategoricalArray(v::RepeatedVector) - res = CategoricalArray(parent(v), levels=levels(parent(v))) - res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) - res -end - - Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. 
Try `permutedims` instead") diff --git a/test/reshape.jl b/test/reshape.jl index 6aeca3dc04..480bb2d069 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -1,6 +1,6 @@ module TestReshape -using Test, DataFrames, Random, Logging, PooledArrays, CategoricalArrays +using Test, DataFrames, Random, PooledArrays, CategoricalArrays const ≅ = isequal @testset "the output of unstack" begin @@ -16,10 +16,10 @@ const ≅ = isequal @test levels(df[!, 2]) == ["YYY", "Color", "Mass"] # make sure we did not mess df[!, 2] levels #Unstack without specifying a row column df3 = unstack(df, :Key, :Value) - #The expected output, XXX level should be dropped as it has no rows with this key + # The expected output is in order of appearance df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"], - Color = Union{String, Missing}["Red", "Grey"], - Mass = Union{String, Missing}["12 g", "18 g"]) + Mass = Union{String, Missing}["12 g", "18 g"], + Color = Union{String, Missing}["Red", "Grey"]) @test df2 ≅ df4 @test typeof(df2[!, :Fish]) <: CategoricalVector{Union{String, Missing}} # first column stays as CategoricalArray in df3 @@ -39,8 +39,8 @@ const ≅ = isequal df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"], - _COLOR_ = Union{String, Missing}["Red", "Grey"], - _MASS_ = Union{String, Missing}["12 g", "18 g"]) + _MASS_ = Union{String, Missing}["12 g", "18 g"], + _COLOR_ = Union{String, Missing}["Red", "Grey"]) @test df2 == df4 @test df3 == df4 @@ -52,10 +52,9 @@ const ≅ = isequal df2 = unstack(df, :Fish, :Key, :Value) #Unstack without specifying a row column df3 = unstack(df, :Key, :Value) - #The expected output, XXX level should be dropped as it has no rows with this key - df4 = DataFrame(Fish = ["Batman", "Bob"], - Color = ["Grey", "Red"], - Mass = ["18 g", "12 g"]) + df4 = DataFrame(Fish = ["Bob", "Batman"], 
+ Mass = ["12 g", "18 g"], + Color = ["Red", "Grey"]) @test df2 ≅ df4 @test typeof(df2[!, :Fish]) <: Vector{String} # first column stays as CategoricalArray in df3 @@ -66,7 +65,7 @@ const ≅ = isequal df2 = unstack(df, :Fish, :Key, :Value) #This changes the expected result allowmissing!(df4, :Mass) - df4[2, :Mass] = missing + df4[1, :Mass] = missing @test df2 ≅ df4 df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"], @@ -74,9 +73,9 @@ const ≅ = isequal Value = ["12 g", "Red", "18 g", "Grey"]) df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_")) - df4 = DataFrame(Fish = ["Batman", "Bob"], - _COLOR_ = ["Grey", "Red"], - _MASS_ = ["18 g", "12 g"]) + df4 = DataFrame(Fish = ["Bob", "Batman"], + _MASS_ = ["12 g", "18 g"], + _COLOR_ = ["Red", "Grey"]) @test df2 == df4 @test df3 == df4 @@ -90,10 +89,10 @@ const ≅ = isequal # test missing value in grouping variable mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4) - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf + @test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf + @test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf # test more than one grouping column wide = DataFrame(id = 1:12, @@ -143,46 +142,47 @@ end df = DataFrame(id=Union{Int, Missing}[1, 2, 1, 2], id2=Union{Int, Missing}[1, 2, 1, 2], variable=["a", "b", "a", "b"], value=[3, 4, 5, 6]) - @test_logs (:warn, "Duplicate entries in unstack at row 3 for key 1 and variable a.") unstack(df, :id, 
:variable, :value) - @test_logs (:warn, "Duplicate entries in unstack at row 3 for key (1, 1) and variable a.") unstack(df, :variable, :value) - a, b = with_logger(NullLogger()) do - unstack(df, :id, :variable, :value), unstack(df, :variable, :value) - end + @test_throws ArgumentError unstack(df, :id, :variable, :value) + @test_throws ArgumentError unstack(df, :variable, :value) + a = unstack(df, :id, :variable, :value, allowduplicates=true) + b = unstack(df, :variable, :value, allowduplicates=true) @test a ≅ DataFrame(id = [1, 2], a = [5, missing], b = [missing, 6]) @test b ≅ DataFrame(id = [1, 2], id2 = [1, 2], a = [5, missing], b = [missing, 6]) df = DataFrame(id=1:2, variable=["a", "b"], value=3:4) - @test_nowarn unstack(df, :id, :variable, :value) - @test_nowarn unstack(df, :variable, :value) a = unstack(df, :id, :variable, :value) b = unstack(df, :variable, :value) @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4]) - df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1]) - @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value) - @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value) + df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1]) + @test_throws ArgumentError unstack(df, :variable, :value) + @test_throws ArgumentError unstack(df, :id, :variable, :value) + @test unstack(df, :variable, :value, allowduplicates=true) ≅ DataFrame(id=1, x=missing) + @test unstack(df, :id, :variable, :value, allowduplicates=true) ≅ DataFrame(id=1, x=missing) end @testset "missing values in colkey" begin df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"], value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]) - @test_logs (:warn, "Missing value in variable :variable at row 3. 
Skipping.") unstack(df, :variable, :value) - udf = with_logger(NullLogger()) do - unstack(df, :variable, :value) - end - @test propertynames(udf) == [:id, :a, :b, :missing] - @test udf[!, :missing] ≅ [missing, 9.0, missing] + @test_throws ArgumentError unstack(df, :variable, :value) + @test_throws ArgumentError unstack(df, :variable, :value, allowmissing=true) + udf = unstack(df, :variable, :value, allowmissing=true, renamecols=x -> coalesce(x, "MISSING")) + @test propertynames(udf) == [:id, :a, :b, :MISSING, :missing] + @test udf[!, :missing] ≅ [missing, missing, 9.0] + @test udf[!, :MISSING] ≅ [3.0, missing, missing] + df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], id2=[1, 1, 1, missing, missing, missing, 2, 2, 2], variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"], value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]) - @test_logs (:warn, "Missing value in variable :variable at row 3. Skipping.") unstack(df, 3, 4) - udf = with_logger(NullLogger()) do - unstack(df, 3, 4) - end - @test propertynames(udf) == [:id, :id2, :a, :b, :missing] - @test udf[!, :missing] ≅ [missing, 9.0, missing] + @test_throws ArgumentError unstack(df, 3, 4) + @test_throws ArgumentError unstack(df, 3, 4, allowmissing=true) + udf = unstack(df, 3, 4, allowmissing=true, renamecols=x -> coalesce(x, "MISSING")) + + @test propertynames(udf) == [:id, :id2, :a, :b, :MISSING, :missing] + @test udf[!, :missing] ≅ [missing, missing, 9.0] + @test udf[!, :MISSING] ≅ [3.0, missing, missing] end @testset "stack-unstack correctness" begin @@ -419,24 +419,6 @@ end @test_throws ArgumentError flatten(df_bad, [:b, :c]) end -@testset "test RepeatedVector for categorical" begin - v = categorical(["a", "b", "c"], ordered=true) - levels!(v, ["b", "c", "a"]) - rv = DataFrames.RepeatedVector(v, 1, 1) - @test isordered(v) - @test isordered(categorical(v)) - @test levels(v) == ["b", "c", "a"] - @test levels(categorical(v)) == ["b", "c", "a"] - - v = categorical(["a", 
"b", "c"]) - levels!(v, ["b", "c", "a"]) - rv = DataFrames.RepeatedVector(v, 1, 1) - @test !isordered(v) - @test !isordered(categorical(v)) - @test levels(v) == ["b", "c", "a"] - @test levels(categorical(v)) == ["b", "c", "a"] -end - @testset "stack categorical test" begin Random.seed!(1234) d1 = DataFrame(a = repeat([1:3;], inner = [4]), @@ -507,6 +489,97 @@ end @test eltype(typeof(sdf2.value)) === Float64 end +@testset "additional unstack tests" begin + df = DataFrame(id=repeat(1:3, inner=3), + id2=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) == + DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + + # make sure we always use order of appereance + Random.seed!(1234) + # Use a large value to test several orders of appearance + for i in 1:16 + df = df[Random.shuffle(1:9), :] + wide1 = unstack(df, :id, :var, :val) + wide2 = unstack(df, [:id, :id2], :var, :val) + wide3 = unstack(df, :var, :val) + @test wide1[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] == + DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test wide2[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] == + DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test wide2 == wide3 + + df2 = copy(df) + df2.id = PooledArray(df.id) + df2.var = PooledArray(df.var) + @test unstack(df2, :id, :var, :val) == wide1 + @test unstack(df2, [:id, :id2], :var, :val) == wide2 + @test unstack(df2, :var, :val) == wide3 + + df2 = categorical(df, 1:3) + @test unstack(df2, :id, :var, :val) == wide1 + @test unstack(df2, [:id, :id2], :var, :val) == wide2 + @test unstack(df2, :var, :val) == wide3 + levels!(df2.id, [10, 2, 11, 3, 1, 12]) + levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z']) + @test unstack(df2, :id, :var, :val) == wide1 + @test unstack(df2, [:id, :id2], :var, :val) == wide2 + @test unstack(df2, :var, :val) == 
wide3 + end + + df = DataFrame(id=repeat(1:3, inner=3), + a=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9) + @test_throws ArgumentError unstack(df, :a, :var, :val) + @test_throws ArgumentError unstack(df, [:id, :a], :var, :val) + + df = DataFrame(id=repeat(1:3, inner=3), + id2=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + df[4, 1:2] .= 1 + @test_throws ArgumentError unstack(df, :id, :var, :val) + @test_throws ArgumentError unstack(df, [:id, :id2], :var, :val) + @test unstack(df, :id, :var, :val, allowduplicates=true) ≅ + DataFrame(id=1:3, a=[4, missing, 7], b=2:3:8, c=3:3:9) + @test unstack(df, [:id, :id2], :var, :val, allowduplicates=true) ≅ + DataFrame(id=1:3, id2=1:3, a=[4, missing, 7], b=2:3:8, c=3:3:9) + + df = DataFrame(id=repeat(1:3, inner=3), + id2=repeat(1:3, inner=3), + var=repeat('a':'c', 3), + val=1:9) + allowmissing!(df, :var) + df.var[4] = missing + @test_throws ArgumentError unstack(df, :id, :var, :val) + @test_throws ArgumentError unstack(df, [:id, :id2], :var, :val) + @test unstack(df, :id, :var, :val, allowmissing=true) ≅ + DataFrame(id=1:3, a=[1, missing, 7], b=2:3:8, c=3:3:9, missing=[missing, 4, missing]) + @test unstack(df, [:id, :id2], :var, :val, allowmissing=true) ≅ + DataFrame(id=1:3, id2=1:3, a=[1, missing, 7], b=2:3:8, c=3:3:9, missing=[missing, 4, missing]) +end + +# test scenario when sorting fails both in grouping and in variable +struct A_TYPE + x +end + +@testset "additional unstack tests not sortable" begin + df = DataFrame(id=repeat(A_TYPE.([2, 1, 3]), inner=3), + id2=repeat(A_TYPE.([2, 1, 3]), inner=3), + var=repeat(A_TYPE.([3, 2, 1]), 3), + val=1:9) + @test unstack(df, :id, :var, :val, renamecols=x -> Symbol(:x, x.x)) == + DataFrame(id=A_TYPE.([2, 1, 3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) + @test unstack(df, [:id, :id2], :var, :val, renamecols=x -> Symbol(:x, x.x)) == + DataFrame(id=A_TYPE.([2, 1, 3]), id2=A_TYPE.([2, 1, 
3]), x3=1:3:7, x2=2:3:8, x1=3:3:9) +end + @testset "permutedims" begin df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2))