JuliaData · bkamins · Oct 31, 2020 · Oct 13, 2020 · Oct 21, 2020 · Oct 21, 2020
diff --git a/NEWS.md b/NEWS.md
@@ -41,8 +41,8 @@
 * in `describe` the specification of custom aggregation is now `function => name`;
   old `name => function` order is now deprecated
   ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401))
-* `unstack` now does not depend on CategoricalArrays.jl and has two new keyword
-  arguments `allowmissing` and `allowduplicates`
+* `unstack` now produces rows and columns in the order that `groupby` produces
+   and has two new keyword arguments `allowmissing` and `allowduplicates`
   ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494))
 
 ## New functionalities

diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -207,8 +207,8 @@ end
 
 Unstack data frame `df`, i.e. convert it from long to wide format.
 
-Row keys and values from value column will be ordered in the order of their
-appearance in the respective vectors.
+Row keys and values from value column will be ordered in the order produced
+by `groupby`.
 
 # Positional arguments
 - `df` : the AbstractDataFrame to be unstacked
@@ -359,29 +359,40 @@ unstack(df::AbstractDataFrame; renamecols::Function=identity,
     unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
             allowduplicates=allowduplicates)
 
-function getrefs(g::GroupedDataFrame)
-    idx::Vector{Int}, starts::Vector{Int}, ends::Vector{Int} = g.idx, g.starts, g.ends
-    groupidxs = [idx[starts[i]:ends[i]] for i in 1:length(starts)]
-    ref = zeros(Int, size(parent(g), 1))
-    for i in 1:length(groupidxs)
-        ref[groupidxs[i]] .= i
+# idx, starts and ends of a GroupedDataFrame are computed lazily, so instead of
+# forcing their computation we reference gdf.groups directly.
+# This function is tailor-made for unstack: it assumes that no groups were
+# dropped (i.e. gdf.groups does not contain 0 entries).
+function find_group_row(gdf::GroupedDataFrame)
+    rows = zeros(Int, length(gdf)) # one slot per group index; 0 means "not seen yet"
+    isempty(rows) && return rows
+
+    filled = 0 # number of slots in rows that have already been assigned
+    i = 1 # current position in gdf.groups
+    while filled < length(gdf) # stop scanning as soon as every slot is assigned
+        group = gdf.groups[i]
+        if rows[group] == 0 # first time this group index is encountered
+            rows[group] = i
+            filled += 1
+        end
+        i += 1
     end
-    return ref
+    return rows # row index of the first occurrence of each group in gdf
 end
 
 function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                   colkey::Int, g_colkey::GroupedDataFrame,
                   valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
                   renamecols::Function,
                   allowmissing::Bool, allowduplicates::Bool)
-    rowref = getrefs(g_rowkey)
-    df1 = df[g_rowkey.idx[g_rowkey.starts], g_rowkey.cols]
+    rowref = g_rowkey.groups
+    df1 = df[find_group_row(g_rowkey), g_rowkey.cols]
     Nrow = length(g_rowkey)
 
     @assert groupcols(g_colkey) == _names(df)[colkey:colkey]
-    colref = getrefs(g_colkey)
+    colref = g_colkey.groups
     Ncol = length(g_colkey)
-    colref_map = df[g_colkey.starts, colkey]
+    colref_map = df[find_group_row(g_colkey), colkey]
 
     if any(ismissing, colref_map) && !allowmissing
         throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." *

diff --git a/test/reshape.jl b/test/reshape.jl
@@ -52,10 +52,9 @@ const ≅ = isequal
     df2 = unstack(df, :Fish, :Key, :Value)
     #Unstack without specifying a row column
     df3 = unstack(df, :Key, :Value)
-    #The expected output, XXX level should be dropped as it has no rows with this key
-    df4 = DataFrame(Fish = ["Batman", "Bob"],
-                    Color = ["Grey", "Red"],
-                    Mass = ["18 g", "12 g"])
+    df4 = DataFrame(Fish = ["Bob", "Batman"],
+                    Mass = ["12 g", "18 g"],
+                    Color = ["Red", "Grey"])
     @test df2 ≅ df4
     @test typeof(df2[!, :Fish]) <: Vector{String}
     # first column stays as CategoricalArray in df3
@@ -66,17 +65,17 @@ const ≅ = isequal
     df2 = unstack(df, :Fish, :Key, :Value)
     #This changes the expected result
     allowmissing!(df4, :Mass)
-    df4[2, :Mass] = missing
+    df4[1, :Mass] = missing
     @test df2 ≅ df4
 
     df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
                    Key = ["Mass", "Color", "Mass", "Color"],
                    Value = ["12 g", "Red", "18 g", "Grey"])
     df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
     df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
-    df4 = DataFrame(Fish = ["Batman", "Bob"],
-                    _COLOR_ = ["Grey", "Red"],
-                    _MASS_ = ["18 g", "12 g"])
+    df4 = DataFrame(Fish = ["Bob", "Batman"],
+                    _MASS_ = ["12 g", "18 g"],
+                    _COLOR_ = ["Red", "Grey"])
     @test df2 == df4
     @test df3 == df4
 
@@ -90,10 +89,10 @@ const ≅ = isequal
 
     # test missing value in grouping variable
     mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4)
-    @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
-    @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
-    @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
-    @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
+    @test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf
+    @test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf
+    @test unstack(stack(mdf, Not(:id)), :id, :variable, :value) ≅ mdf
+    @test unstack(stack(mdf, Not(1)), :id, :variable, :value) ≅ mdf
 
     # test more than one grouping column
     wide = DataFrame(id = 1:12,
@@ -169,8 +168,8 @@ end
     @test_throws ArgumentError unstack(df, :variable, :value)
     @test_throws ArgumentError unstack(df, :variable, :value, allowmissing=true)
     udf = unstack(df, :variable, :value, allowmissing=true, renamecols=x -> coalesce(x, "MISSING"))
-    @test propertynames(udf) == [:id, :a, :b, :missing, :MISSING]
-    @test udf[!, :missing] ≅ [missing, 9.0, missing]
+    @test propertynames(udf) == [:id, :a, :b, :MISSING, :missing]
+    @test udf[!, :missing] ≅ [missing, missing, 9.0]
     @test udf[!, :MISSING] ≅ [3.0, missing, missing]
 
     df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
@@ -181,8 +180,8 @@ end
     @test_throws ArgumentError unstack(df, 3, 4, allowmissing=true)
     udf = unstack(df, 3, 4, allowmissing=true, renamecols=x -> coalesce(x, "MISSING"))
 
-    @test propertynames(udf) == [:id, :id2, :a, :b, :missing, :MISSING]
-    @test udf[!, :missing] ≅ [missing, 9.0, missing]
+    @test propertynames(udf) == [:id, :id2, :a, :b, :MISSING, :missing]
+    @test udf[!, :missing] ≅ [missing, missing, 9.0]
     @test udf[!, :MISSING] ≅ [3.0, missing, missing]
 end
 
@@ -499,12 +498,36 @@ end
     @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) ==
           DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
 
+    # an exercise on current unstack invariants
     Random.seed!(1234)
-    for i in 1:3
+    for i in 1:16
         df = df[Random.shuffle(1:9), :]
-        @test unstack(df, :id, :var, :val) == DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
-        @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) ==
+        @test unstack(df, :id, :var, :val)[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] ==
+              DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val)
+        @test unstack(df, :var, :val)[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] ==
+              DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+
+        df2 = copy(df)
+        df2.id = PooledArray(df.id)
+        df2.var = PooledArray(df.var)
+        @test unstack(df2, :id, :var, :val)[sortperm(df2.id.pool), [1; 1 .+ sortperm(df2.var.pool)]] ==
+              DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val)
+        @test unstack(df2, :var, :val)[sortperm(df2.id.pool), [1:2; 2 .+ sortperm(df2.var.pool)]] ==
+              DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+
+        df2 = categorical(df, 1:3)
+        @test unstack(df2, :id, :var, :val) ==
+              DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
               DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        levels!(df2.id, [10, 2, 11, 3, 1, 12])
+        levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z'])
+        @test unstack(df2, :id, :var, :val) ==
+              DataFrame(id=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
+        @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
+              DataFrame(id=1:3, id2=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
     end
 
     df = DataFrame(id=repeat(1:3, inner=3),