Implement permutedims (#2447)

kescobo · web-flow · commit e07b08da25eb · 2020-10-16T17:54:02.000+02:00
diff --git a/NEWS.md b/NEWS.md
@@ -76,6 +76,7 @@
 * add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449))
 * passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine`
   with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476))
+* add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447))
 
 ## Deprecated
 
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -57,6 +57,7 @@ vcat
 ```@docs
 stack
 unstack
+permutedims
 ```
 
 ## Sorting
diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md
@@ -380,3 +380,53 @@ julia> first(unstack(x, :Species, :vsum), 6)
 │ 4   │ PetalWidth  │ 0.244       │ 1.326           │ 2.026          │
 │ 5   │ id          │ 25.5        │ 75.5            │ 125.5          │
 ```
+
+To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref).
+
+```jldoctest reshape
+julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a      │ b       │ c     │ d    │
+│     │ String │ Float64 │ Int64 │ Bool │
+├─────┼────────┼─────────┼───────┼──────┤
+│ 1   │ x      │ 1.0     │ 3     │ 1    │
+│ 2   │ y      │ 2.0     │ 4     │ 0    │
+
+julia> permutedims(df1, 1)
+3×3 DataFrame
+│ Row │ a      │ x       │ y       │
+│     │ String │ Float64 │ Float64 │
+├─────┼────────┼─────────┼─────────┤
+│ 1   │ b      │ 1.0     │ 2.0     │
+│ 2   │ c      │ 3.0     │ 4.0     │
+│ 3   │ d      │ 1.0     │ 0.0     │
+```
+
+Note that the values of the column indexed by `src_namescol` in the original `df`
+become the column names in the permuted result,
+and the column names of the original become a new column.
+Typically, this would be used on columns with homogeneous element types,
+since the element types of the other columns
+are the result of `promote_type` on _all_ the permuted columns.
+Note also that, by default, the new column created from the column names
+of the original `df` has the same name as `src_namescol`.
+An optional positional argument `dest_namescol` can alter this:
+
+```jldoctest reshape
+julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a      │ b   │ c     │ d    │
+│     │ String │ Any │ Int64 │ Bool │
+├─────┼────────┼─────┼───────┼──────┤
+│ 1   │ x      │ 1   │ 3     │ 1    │
+│ 2   │ y      │ two │ 4     │ 0    │
+
+julia> permutedims(df2, 1, "different_name")
+3×3 DataFrame
+│ Row │ different_name │ x   │ y   │
+│     │ String         │ Any │ Any │
+├─────┼────────────────┼─────┼─────┤
+│ 1   │ b              │ 1   │ two │
+│ 2   │ c              │ 3   │ 4   │
+│ 3   │ d              │ 1   │ 0   │
+```
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -399,3 +399,107 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector)
     res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
     res
 end
+
+
+# `transpose` is deliberately not supported for `AbstractDataFrame`s;
+# use `permutedims` instead to turn a data frame on its side.
+# The error has to be constructed with the function and its arguments and then
+# thrown explicitly: `MethodError` has no constructor taking a message string,
+# so calling `MethodError("...")` only failed by accident and reported a
+# confusing "no method matching MethodError(::String)".
+Base.transpose(df::AbstractDataFrame, args...; kwargs...) =
+    throw(MethodError(transpose, (df, args...)))
+
+"""
+    permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString},
+                [dest_namescol::Union{Symbol, AbstractString}];
+                makeunique::Bool=false)
+
+Return a new `DataFrame` that is `df` turned on its side: rows of `df` become columns,
+and the values stored in the column indexed by `src_namescol` are used
+as the names of these new columns.
+The names of the remaining columns of `df` are stored in the first column
+of the result, which is named `dest_namescol`.
+
+# Arguments
+- `df` : the `AbstractDataFrame` to permute
+- `src_namescol` : the column whose values become the new header.
+  Its element type must be `AbstractString` or `Symbol`.
+- `dest_namescol` : the name of the first column of the returned `DataFrame`.
+  If omitted, the name of `src_namescol` is used.
+- `makeunique` : if `false` (the default), an error is raised
+  when duplicate names are found; if `true`, duplicate names are suffixed
+  with `_i` (`i` starting at 1 for the first duplicate).
+
+Note: The element types of the columns of the resulting `DataFrame`
+(other than the first column, which always has element type `String`)
+depend on the element types of _all_ input columns,
+as they are determined by `promote_type`.
+For example, if the source data frame contains `Int` and `Float64` columns,
+the resulting columns have element type `Float64`. If the source has
+`Int` and `String` columns, the resulting columns have element type `Any`.
+
+# Examples
+
+```jldoctest
+julia> df1 = DataFrame(a=["x", "y"], b=[1., 2.], c=[3, 4], d=[true,false])
+2×4 DataFrame
+│ Row │ a      │ b       │ c     │ d    │
+│     │ String │ Float64 │ Int64 │ Bool │
+├─────┼────────┼─────────┼───────┼──────┤
+│ 1   │ x      │ 1.0     │ 3     │ 1    │
+│ 2   │ y      │ 2.0     │ 4     │ 0    │
+
+julia> permutedims(df1, 1) # note the column types
+3×3 DataFrame
+│ Row │ a      │ x       │ y       │
+│     │ String │ Float64 │ Float64 │
+├─────┼────────┼─────────┼─────────┤
+│ 1   │ b      │ 1.0     │ 2.0     │
+│ 2   │ c      │ 3.0     │ 4.0     │
+│ 3   │ d      │ 1.0     │ 0.0     │
+
+julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a      │ b   │ c     │ d    │
+│     │ String │ Any │ Int64 │ Bool │
+├─────┼────────┼─────┼───────┼──────┤
+│ 1   │ x      │ 1   │ 3     │ 1    │
+│ 2   │ y      │ two │ 4     │ 0    │
+
+julia> permutedims(df2, 1, "different_name")
+3×3 DataFrame
+│ Row │ different_name │ x   │ y   │
+│     │ String         │ Any │ Any │
+├─────┼────────────────┼─────┼─────┤
+│ 1   │ b              │ 1   │ two │
+│ 2   │ c              │ 3   │ 4   │
+│ 3   │ d              │ 1   │ 0   │
+```
+"""
+function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
+                          dest_namescol::Union{Symbol, AbstractString};
+                          makeunique::Bool=false)
+    # Integer positions are validated up front so that an out-of-range position
+    # is reported as a `BoundsError` on the index of `df`
+    if src_namescol isa Integer && !(1 <= src_namescol <= ncol(df))
+        throw(BoundsError(index(df), src_namescol))
+    end
+
+    # values of this column become the column names of the result
+    newnames = df[!, src_namescol]
+    if !(eltype(newnames) <: SymbolOrString)
+        throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`"))
+    end
+
+    # first column of the result: names of all columns except `src_namescol`
+    others = df[!, Not(src_namescol)]
+    namescol = DataFrame(dest_namescol => names(others))
+
+    permuted = if ncol(others) > 0
+        # going through a `Matrix` is what promotes all columns to a common eltype
+        flipped = permutedims(Matrix(others))
+        rename!(DataFrame(Tables.table(flipped)), newnames, makeunique=makeunique)
+    else
+        # no values to permute: create one empty column per row of `df`
+        DataFrame(AbstractVector[[] for _ in 1:nrow(df)], newnames,
+                  makeunique=makeunique, copycols=false)
+    end
+    return hcat!(namescol, permuted, makeunique=makeunique, copycols=false)
+end
+
+# Convenience method used when `dest_namescol` is not passed: the first column
+# of the result is then named after the column indexed by `src_namescol`.
+function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
+                          makeunique::Bool=false)
+    # a `Symbol` or string index already is the name of the source column
+    if !(src_namescol isa Integer)
+        return permutedims(df, src_namescol, src_namescol; makeunique=makeunique)
+    end
+
+    # an integer index has to be translated to the corresponding column name,
+    # after making sure it points to an existing column
+    if !(1 <= src_namescol <= ncol(df))
+        throw(BoundsError(index(df), src_namescol))
+    end
+    return permutedims(df, src_namescol, _names(df)[src_namescol];
+                       makeunique=makeunique)
+end
diff --git a/test/reshape.jl b/test/reshape.jl
@@ -25,10 +25,10 @@ const ≅ = isequal
     # first column stays as CategoricalArray in df3
     @test df3 == df4
     #Make sure unstack works with missing values at the start of the value column
-    df[1,:Value] = missing
+    df[1, :Value] = missing
     df2 = unstack(df, :Fish, :Key, :Value)
     #This changes the expected result
-    df4[1,:Mass] = missing
+    df4[1, :Mass] = missing
     @test df2 ≅ df4
 
     df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]),
@@ -62,11 +62,11 @@ const ≅ = isequal
     @test df3 == df4
     #Make sure unstack works with missing values at the start of the value column
     allowmissing!(df, :Value)
-    df[1,:Value] = missing
+    df[1, :Value] = missing
     df2 = unstack(df, :Fish, :Key, :Value)
     #This changes the expected result
     allowmissing!(df4, :Mass)
-    df4[2,:Mass] = missing
+    df4[2, :Mass] = missing
     @test df2 ≅ df4
 
     df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
@@ -89,9 +89,9 @@ const ≅ = isequal
     @test_throws TypeError unstack(df, :Key, :Value, renamecols=Symbol)
 
     # test missing value in grouping variable
-    mdf = DataFrame(id=[missing,1,2,3], a=1:4, b=1:4)
-    @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:]
-    @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:]
+    mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4)
+    @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
+    @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
     @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
     @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
 
@@ -158,7 +158,7 @@ end
     b = unstack(df, :variable, :value)
     @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])
 
-    df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
+    df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1])
     @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value)
     @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value)
 end
@@ -225,14 +225,14 @@ end
     @test d1s2 == d1s3
     @test propertynames(d1s) == [:c, :d, :e, :variable, :value]
     @test d1s == d1m
-    d1m = stack(d1[:, [1,3,4]], Not(:a))
+    d1m = stack(d1[:, [1, 3, 4]], Not(:a))
     @test propertynames(d1m) == [:a, :variable, :value]
 
     # Test naming of measure/value columns
     d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval)
     @test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval)
     @test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval]
-    d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval)
+    d1m_named = stack(d1[:, [1, 3, 4]], Not(:a), variable_name=:letter, value_name=:someval)
     @test propertynames(d1m_named) == [:a, :letter, :someval]
 
     # test empty measures or ids
@@ -270,21 +270,21 @@ end
     @test d1s[!, 5] isa DataFrames.StackedVector
     @test ndims(d1s[!, 5]) == 1
     @test ndims(typeof(d1s[!, 2])) == 1
-    @test d1s[!, 4][[1,24]] == ["a", "b"]
-    @test d1s[!, 5][[1,24]] == [1, 4]
+    @test d1s[!, 4][[1, 24]] == ["a", "b"]
+    @test d1s[!, 5][[1, 24]] == [1, 4]
     @test_throws ArgumentError d1s[!, 4][true]
     @test_throws ArgumentError d1s[!, 5][true]
     @test_throws ArgumentError d1s[!, 4][1.0]
     @test_throws ArgumentError d1s[!, 5][1.0]
 
     d1ss = stack(d1, [:a, :b], view=true)
-    @test d1ss[!, 4][[1,24]] == ["a", "b"]
+    @test d1ss[!, 4][[1, 24]] == ["a", "b"]
     @test d1ss[!, 4] isa DataFrames.RepeatedVector
     d1ss = stack(d1, [:a, :b], view=true, variable_eltype=String)
-    @test d1ss[!, 4][[1,24]] == ["a", "b"]
+    @test d1ss[!, 4][[1, 24]] == ["a", "b"]
     @test d1ss[!, 4] isa DataFrames.RepeatedVector
     d1ss = stack(d1, [:a, :b], view=true, variable_eltype=Symbol)
-    @test d1ss[!, 4][[1,24]] == [:a, :b]
+    @test d1ss[!, 4][[1, 24]] == [:a, :b]
     @test d1ss[!, 4] isa DataFrames.RepeatedVector
 
     # Those tests check indexing RepeatedVector/StackedVector by a vector
@@ -307,7 +307,7 @@ end
     @test d1s2 == d1s3
     @test propertynames(d1s) == [:c, :d, :e, :variable, :value]
     @test d1s == d1m
-    d1m = stack(d1[:, [1,3,4]], Not(:a), view=true)
+    d1m = stack(d1[:, [1, 3, 4]], Not(:a), view=true)
     @test propertynames(d1m) == [:a, :variable, :value]
 
     d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true)
@@ -329,13 +329,13 @@ end
     @test d1us3 == unstack(d1s2)
 
     # test unstack with exactly one key column that is not passed
-    df1 = stack(DataFrame(rand(10,10)))
+    df1 = stack(DataFrame(rand(10, 10)))
     df1[!, :id] = 1:100
     @test size(unstack(df1, :variable, :value)) == (100, 11)
     @test unstack(df1, :variable, :value) ≅ unstack(df1)
 
     # test empty keycol
-    @test_throws ArgumentError unstack(stack(DataFrame(rand(3,2))), :variable, :value)
+    @test_throws ArgumentError unstack(stack(DataFrame(rand(3, 2))), :variable, :value)
 end
 
 @testset "column names duplicates" begin
@@ -494,7 +494,7 @@ end
 end
 
 @testset "test stack eltype" begin
-    df = DataFrame(rand(4,5))
+    df = DataFrame(rand(4, 5))
     sdf = stack(df)
     @test eltype(sdf.variable) === String
     @test eltype(typeof(sdf.variable)) === String
@@ -507,4 +507,70 @@ end
     @test eltype(typeof(sdf2.value)) === Float64
 end
 
+@testset "permutedims" begin
+    df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2))
+
+    # unsupported operation and a source column that does not exist
+    @test_throws MethodError transpose(df1)
+    @test_throws ArgumentError permutedims(df1, :bar)
+
+    # shape and names of the result; source column given by position or by name
+    perm1 = permutedims(df1, 1)
+    @test size(perm1, 1) == ncol(df1) - 1
+    @test size(perm1, 2) == nrow(df1) + 1
+    @test names(perm1) == ["a", "x", "y"]
+    @test perm1 == permutedims(df1, :a) == permutedims(df1, 1)
+    @test names(permutedims(df1, :a, :foo)) == ["foo", "x", "y"]
+
+    # each row holds the name of an original column followed by its values
+    rownames1 = names(df1)[2:end]
+    for (i, row) in enumerate(eachrow(perm1))
+        colname = rownames1[i]
+        @test Vector(row) == [colname; df1[!, colname]]
+    end
+
+    # All columns should be promoted
+    @test eltype(perm1.x) == Float64
+    @test eltype(perm1.y) == Float64
+
+    # heterogeneous columns: no common numeric type, so values fall back to `Any`
+    df2 = DataFrame(a=["x", "y"], b=[1.0, "str"], c=[1, 2], d=rand(Bool, 2))
+
+    perm2 = permutedims(df2, :a)
+    @test size(perm2, 1) == ncol(df2) - 1
+    @test size(perm2, 2) == nrow(df2) + 1
+    @test names(perm2) == ["a", "x", "y"]
+
+    rownames2 = names(df2)[2:end]
+    for (i, row) in enumerate(eachrow(perm2))
+        colname = rownames2[i]
+        @test Vector(row) == [colname; df2[!, colname]]
+    end
+    @test eltype(perm2.x) == Any
+    @test eltype(perm2.y) == Any
+
+    # duplicated values in the source column require `makeunique=true`
+    df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10))
+    df3_single = df3[!, [:a]]
+
+    expected_names = vcat(["a", "x"], ["x_$i" for i in 1:9])
+    @test_throws ArgumentError permutedims(df3, 1)
+    @test names(permutedims(df3, 1, makeunique=true)) == expected_names
+    @test_throws ArgumentError permutedims(df3_single, 1) # single column branch
+    @test names(permutedims(df3_single, 1, makeunique=true)) == expected_names
+
+    df4 = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing],
+                    e=["x", "y"], f=[:x, :y], # valid src
+                    g=[missing, "y"], h=Union{Missing, String}["x", "y"] # invalid src
+                    )
+
+    # the same result is obtained for string and `Symbol` source columns,
+    # whatever the position of the source column
+    @test permutedims(df4[!, [:a, :b, :c, :e]], :e) ==
+          permutedims(df4[!, [:e, :a, :b, :c]], 1) ==
+          permutedims(df4[!, [:a, :b, :c, :f]], :f, :e)
+    # Can permute single-column
+    @test permutedims(df4[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[])
+    # Can't index float Column
+    @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]], 1)
+    @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1)
+    # Can't index columns that allow for missing
+    @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]], 1)
+    @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]], 1)
+    # Can't permute empty `df` ...
+    @test_throws BoundsError permutedims(DataFrame(), 1)
+    # ... but can permute zero-row df
+    @test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"])
+end
+
 end # module