Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
* in `describe` the specification of custom aggregation is now `function => name`;
old `name => function` order is now deprecated
([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401))
* `unstack` now does not depend on CategoricalArrays.jl and has two new keyword
arguments `allowmissing` and `allowduplicates`
* `unstack` now produces rows and columns in the order in which `groupby` produces them
and has two new keyword arguments `allowmissing` and `allowduplicates`
([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494))

## New functionalities
Expand Down
37 changes: 24 additions & 13 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ end

Unstack data frame `df`, i.e. convert it from long to wide format.

Row keys and values from value column will be ordered in the order of their
appearance in the respective vectors.
Row keys and values from value column will be ordered in the order produced
by `groupby`.

# Positional arguments
- `df` : the AbstractDataFrame to be unstacked
Expand Down Expand Up @@ -359,29 +359,40 @@ unstack(df::AbstractDataFrame; renamecols::Function=identity,
unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
allowduplicates=allowduplicates)

function getrefs(g::GroupedDataFrame)
idx::Vector{Int}, starts::Vector{Int}, ends::Vector{Int} = g.idx, g.starts, g.ends
groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)]
ref = zeros(Int, size(parent(g), 1))
for i in 1:length(groupidxs)
ref[groupidxs[i]] .= i
# Return a vector holding, for each group of `gdf`, the index of the first row
# that belongs to that group.
#
# We take into account the fact that idx, starts and ends are computed lazily,
# so we directly reference gdf.groups instead.
# This function is tailor-made for unstack, so it assumes that no groups were
# dropped (i.e. gdf.groups does not contain 0 entries).
function find_group_row(gdf::GroupedDataFrame)
    rows = zeros(Int, length(gdf))
    isempty(rows) && return rows

    filled = 0
    i = 1
    # stop scanning as soon as every group has been assigned its first row
    while filled < length(gdf)
        group = gdf.groups[i]
        # a zero entry means this group has not been seen yet
        if rows[group] == 0
            rows[group] = i
            filled += 1
        end
        i += 1
    end
    return rows # row index of the first occurrence of each group in gdf
end

function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
colkey::Int, g_colkey::GroupedDataFrame,
valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
renamecols::Function,
allowmissing::Bool, allowduplicates::Bool)
rowref = getrefs(g_rowkey)
df1 = df[g_rowkey.idx[g_rowkey.starts], g_rowkey.cols]
rowref = g_rowkey.groups
df1 = df[find_group_row(g_rowkey), g_rowkey.cols]
Nrow = length(g_rowkey)

@assert groupcols(g_colkey) == _names(df)[colkey:colkey]
colref = getrefs(g_colkey)
colref = g_colkey.groups
Ncol = length(g_colkey)
colref_map = df[g_colkey.starts, colkey]
colref_map = df[find_group_row(g_colkey), colkey]

if any(ismissing, colref_map) && !allowmissing
throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." *
Expand Down
61 changes: 42 additions & 19 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@ const ≅ = isequal
df2 = unstack(df, :Fish, :Key, :Value)
#Unstack without specifying a row column
df3 = unstack(df, :Key, :Value)
#The expected output, XXX level should be dropped as it has no rows with this key
df4 = DataFrame(Fish = ["Batman", "Bob"],
Color = ["Grey", "Red"],
Mass = ["18 g", "12 g"])
df4 = DataFrame(Fish = ["Bob", "Batman"],
Mass = ["12 g", "18 g"],
Color = ["Red", "Grey"])
@test df2 ≅ df4
@test typeof(df2[!, :Fish]) <: Vector{String}
# first column stays as CategoricalArray in df3
Expand All @@ -66,17 +65,17 @@ const ≅ = isequal
df2 = unstack(df, :Fish, :Key, :Value)
#This changes the expected result
allowmissing!(df4, :Mass)
df4[2, :Mass] = missing
df4[1, :Mass] = missing
@test df2 ≅ df4

df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
Key = ["Mass", "Color", "Mass", "Color"],
Value = ["12 g", "Red", "18 g", "Grey"])
df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
df4 = DataFrame(Fish = ["Batman", "Bob"],
_COLOR_ = ["Grey", "Red"],
_MASS_ = ["18 g", "12 g"])
df4 = DataFrame(Fish = ["Bob", "Batman"],
_MASS_ = ["12 g", "18 g"],
_COLOR_ = ["Red", "Grey"])
@test df2 == df4
@test df3 == df4

Expand All @@ -90,10 +89,10 @@ const ≅ = isequal

# test missing value in grouping variable
mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4)
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf
@test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf
@test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf

# test more than one grouping column
wide = DataFrame(id = 1:12,
Expand Down Expand Up @@ -169,8 +168,8 @@ end
@test_throws ArgumentError unstack(df, :variable, :value)
@test_throws ArgumentError unstack(df, :variable, :value, allowmissing=true)
udf = unstack(df, :variable, :value, allowmissing=true, renamecols=x -> coalesce(x, "MISSING"))
@test propertynames(udf) == [:id, :a, :b, :missing, :MISSING]
@test udf[!, :missing] ≅ [missing, 9.0, missing]
@test propertynames(udf) == [:id, :a, :b, :MISSING, :missing]
@test udf[!, :missing] ≅ [missing, missing, 9.0]
@test udf[!, :MISSING] ≅ [3.0, missing, missing]

df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
Expand All @@ -181,8 +180,8 @@ end
@test_throws ArgumentError unstack(df, 3, 4, allowmissing=true)
udf = unstack(df, 3, 4, allowmissing=true, renamecols=x -> coalesce(x, "MISSING"))

@test propertynames(udf) == [:id, :id2, :a, :b, :missing, :MISSING]
@test udf[!, :missing] ≅ [missing, 9.0, missing]
@test propertynames(udf) == [:id, :id2, :a, :b, :MISSING, :missing]
@test udf[!, :missing] ≅ [missing, missing, 9.0]
@test udf[!, :MISSING] ≅ [3.0, missing, missing]
end

Expand Down Expand Up @@ -499,12 +498,36 @@ end
@test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)

# an exercise on current unstack invariants
Random.seed!(1234)
for i in 1:3
for i in 1:16
df = df[Random.shuffle(1:9), :]
@test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) ==
@test unstack(df, :id, :var, :val)[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] ==
DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val)
@test unstack(df, :var, :val)[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)

df2 = copy(df)
df2.id = PooledArray(df.id)
df2.var = PooledArray(df.var)
@test unstack(df2, :id, :var, :val)[sortperm(df2.id.pool), [1; 1 .+ sortperm(df2.var.pool)]] ==
DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val)
@test unstack(df2, :var, :val)[sortperm(df2.id.pool), [1:2; 2 .+ sortperm(df2.var.pool)]] ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)

df2 = categorical(df, 1:3)
@test unstack(df2, :id, :var, :val) ==
DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
levels!(df2.id, [10, 2, 11, 3, 1, 12])
levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z'])
@test unstack(df2, :id, :var, :val) ==
DataFrame(id=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
@test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
DataFrame(id=1:3, id2=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
end

df = DataFrame(id=repeat(1:3, inner=3),
Expand Down