Skip to content

Commit e07b08d

Browse files
authored
Implement permutedims (#2447)
1 parent f507944 commit e07b08d

File tree

5 files changed

+241
-19
lines changed

5 files changed

+241
-19
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
* add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449))
7777
* passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine`
7878
with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476))
79+
* add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447))
7980

8081
## Deprecated
8182

docs/src/lib/functions.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ vcat
5757
```@docs
5858
stack
5959
unstack
60+
permutedims
6061
```
6162

6263
## Sorting

docs/src/man/reshaping_and_pivoting.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,3 +380,53 @@ julia> first(unstack(x, :Species, :vsum), 6)
380380
│ 4 │ PetalWidth │ 0.244 │ 1.326 │ 2.026 │
381381
│ 5 │ id │ 25.5 │ 75.5 │ 125.5 │
382382
```
383+
384+
To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref).
385+
386+
```jldoctest reshape
387+
julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
388+
2×4 DataFrame
389+
│ Row │ a │ b │ c │ d │
390+
│ │ String │ Float64 │ Int64 │ Bool │
391+
├─────┼────────┼─────────┼───────┼──────┤
392+
│ 1 │ x │ 1.0 │ 3 │ 1 │
393+
│ 2 │ y │ 2.0 │ 4 │ 0 │
394+
395+
julia> permutedims(df1, 1)
396+
3×3 DataFrame
397+
│ Row │ a │ x │ y │
398+
│ │ String │ Float64 │ Float64 │
399+
├─────┼────────┼─────────┼─────────┤
400+
│ 1 │ b │ 1.0 │ 2.0 │
401+
│ 2 │ c │ 3.0 │ 4.0 │
402+
│ 3 │ d │ 1.0 │ 0.0 │
403+
```
404+
405+
Note that the column indexed by `src_namescol` in the original `df`
406+
becomes the column names in the permuted result,
407+
and the column names of the original become a new column.
408+
Typically, this would be used on columns with homogeneous element types,
409+
since the element types of the other columns
410+
are the result of `promote_type` on _all_ the permuted columns.
411+
Note also that, by default, the new column created from the column names
412+
of the original `df` has the same name as `src_namescol`.
413+
An optional positional argument `dest_namescol` can alter this:
414+
415+
```jldoctest reshape
416+
julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
417+
2×4 DataFrame
418+
│ Row │ a │ b │ c │ d │
419+
│ │ String │ Any │ Int64 │ Bool │
420+
├─────┼────────┼─────┼───────┼──────┤
421+
│ 1 │ x │ 1 │ 3 │ 1 │
422+
│ 2 │ y │ two │ 4 │ 0 │
423+
424+
julia> permutedims(df2, 1, "different_name")
425+
3×3 DataFrame
426+
│ Row │ different_name │ x │ y │
427+
│ │ String │ Any │ Any │
428+
├─────┼────────────────┼─────┼─────┤
429+
│ 1 │ b │ 1 │ two │
430+
│ 2 │ c │ 3 │ 4 │
431+
│ 3 │ d │ 1 │ 0 │
432+
```

src/abstractdataframe/reshape.jl

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,3 +399,107 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector)
399399
res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
400400
res
401401
end
402+
403+
404+
# `transpose` is deliberately unsupported for data frames; `permutedims` is the
# supported way to turn a data frame on its side.
#
# The previous definition evaluated `MethodError("...")` without `throw`. `MethodError`
# has no single-string constructor, so the call failed inside the constructor and the
# user saw "no method matching MethodError(::String)"; the hint text was never shown.
# Throw a well-formed `MethodError` that names `transpose` and its positional arguments
# instead. The exception type stays `MethodError`, which is what callers observe today.
#
# NOTE(review): surfacing the "try `permutedims`" hint in the message itself would need
# an exception type that carries text (e.g. `ArgumentError`), which would change what
# callers and existing tests catch.
Base.transpose(df::AbstractDataFrame, args...; kwargs...) =
    throw(MethodError(Base.transpose, (df, args...)))
406+
407+
"""
408+
permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString},
409+
[dest_namescol::Union{Symbol, AbstractString}];
410+
makeunique::Bool=false)
411+
412+
Turn `df` on its side such that rows become columns
413+
and values in the column indexed by `src_namescol` become the names of new columns.
414+
In the resulting `DataFrame`, column names of `df` will become the first column
415+
with name specified by `dest_namescol`.
416+
417+
# Arguments
418+
- `df` : the `AbstractDataFrame`
419+
- `src_namescol` : the column that will become the new header.
420+
This column's element type must be `AbstractString` or `Symbol`.
421+
- `dest_namescol` : the name of the first column in the returned `DataFrame`.
422+
Defaults to the same name as `src_namescol`.
423+
- `makeunique` : if `false` (the default), an error will be raised
424+
if duplicate names are found; if `true`, duplicate names will be suffixed
425+
with `_i` (`i` starting at 1 for the first duplicate).
426+
427+
Note: The element types of columns in resulting `DataFrame`
428+
(other than the first column, which always has element type `String`)
429+
will depend on the element types of _all_ input columns
430+
based on the result of `promote_type`.
431+
That is, if the source data frame contains `Int` and `Float64` columns,
432+
resulting columns will have element type `Float64`. If the source has
433+
`Int` and `String` columns, resulting columns will have element type `Any`.
434+
435+
# Examples
436+
437+
```jldoctest
438+
julia> df1 = DataFrame(a=["x", "y"], b=[1., 2.], c=[3, 4], d=[true,false])
439+
2×4 DataFrame
440+
│ Row │ a │ b │ c │ d │
441+
│ │ String │ Float64 │ Int64 │ Bool │
442+
├─────┼────────┼─────────┼───────┼──────┤
443+
│ 1 │ x │ 1.0 │ 3 │ 1 │
444+
│ 2 │ y │ 2.0 │ 4 │ 0 │
445+
446+
julia> permutedims(df1, 1) # note the column types
447+
3×3 DataFrame
448+
│ Row │ a │ x │ y │
449+
│ │ String │ Float64 │ Float64 │
450+
├─────┼────────┼─────────┼─────────┤
451+
│ 1 │ b │ 1.0 │ 2.0 │
452+
│ 2 │ c │ 3.0 │ 4.0 │
453+
│ 3 │ d │ 1.0 │ 0.0 │
454+
455+
julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
456+
2×4 DataFrame
457+
│ Row │ a │ b │ c │ d │
458+
│ │ String │ Any │ Int64 │ Bool │
459+
├─────┼────────┼─────┼───────┼──────┤
460+
│ 1 │ x │ 1 │ 3 │ 1 │
461+
│ 2 │ y │ two │ 4 │ 0 │
462+
463+
julia> permutedims(df2, 1, "different_name")
464+
3×3 DataFrame
465+
│ Row │ different_name │ x │ y │
466+
│ │ String │ Any │ Any │
467+
├─────┼────────────────┼─────┼─────┤
468+
│ 1 │ b │ 1 │ two │
469+
│ 2 │ c │ 3 │ 4 │
470+
│ 3 │ d │ 1 │ 0 │
471+
```
472+
"""
473+
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
                          dest_namescol::Union{Symbol, AbstractString};
                          makeunique::Bool=false)
    # Integer positions are range-checked first so that an out-of-range index
    # is reported as a `BoundsError`.
    if src_namescol isa Integer && !(1 <= src_namescol <= ncol(df))
        throw(BoundsError(index(df), src_namescol))
    end

    # The values of this column become the column names of the result.
    header = df[!, src_namescol]
    if !(eltype(header) <: SymbolOrString)
        throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`"))
    end

    # Everything except the header column is what actually gets permuted.
    rest = df[!, Not(src_namescol)]

    # First column of the result: the names of the remaining source columns.
    namescol = DataFrame(dest_namescol => names(rest))

    permuted = if ncol(rest) == 0
        # Nothing to permute: build one empty column per source row,
        # named after the header values.
        DataFrame(AbstractVector[[] for _ in 1:nrow(df)], header,
                  makeunique=makeunique, copycols=false)
    else
        # Going through a `Matrix` is what promotes all permuted columns
        # to a common element type.
        flipped = permutedims(Matrix(rest))
        rename!(DataFrame(Tables.table(flipped)), header, makeunique=makeunique)
    end

    return hcat!(namescol, permuted, makeunique=makeunique, copycols=false)
end
495+
496+
# Two-argument form: the destination names column defaults to the name of the
# source names column. A `Symbol`/string `src_namescol` is reused as-is; an integer
# position is bounds-checked and then translated to the column name at that position.
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
                          makeunique::Bool=false)
    # Non-integer index: it already is the name to reuse.
    src_namescol isa Integer ||
        return permutedims(df, src_namescol, src_namescol; makeunique=makeunique)

    if !(1 <= src_namescol <= ncol(df))
        throw(BoundsError(index(df), src_namescol))
    end
    colname = _names(df)[src_namescol]
    return permutedims(df, src_namescol, colname; makeunique=makeunique)
end

test/reshape.jl

Lines changed: 85 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ const ≅ = isequal
2525
# first column stays as CategoricalArray in df3
2626
@test df3 == df4
2727
#Make sure unstack works with missing values at the start of the value column
28-
df[1,:Value] = missing
28+
df[1, :Value] = missing
2929
df2 = unstack(df, :Fish, :Key, :Value)
3030
#This changes the expected result
31-
df4[1,:Mass] = missing
31+
df4[1, :Mass] = missing
3232
@test df2 df4
3333

3434
df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]),
@@ -62,11 +62,11 @@ const ≅ = isequal
6262
@test df3 == df4
6363
#Make sure unstack works with missing values at the start of the value column
6464
allowmissing!(df, :Value)
65-
df[1,:Value] = missing
65+
df[1, :Value] = missing
6666
df2 = unstack(df, :Fish, :Key, :Value)
6767
#This changes the expected result
6868
allowmissing!(df4, :Mass)
69-
df4[2,:Mass] = missing
69+
df4[2, :Mass] = missing
7070
@test df2 df4
7171

7272
df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
@@ -89,9 +89,9 @@ const ≅ = isequal
8989
@test_throws TypeError unstack(df, :Key, :Value, renamecols=Symbol)
9090

9191
# test missing value in grouping variable
92-
mdf = DataFrame(id=[missing,1,2,3], a=1:4, b=1:4)
93-
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:]
94-
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:]
92+
mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4)
93+
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
94+
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
9595
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
9696
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
9797

@@ -158,7 +158,7 @@ end
158158
b = unstack(df, :variable, :value)
159159
@test a b DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])
160160

161-
df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
161+
df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1])
162162
@test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value)
163163
@test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value)
164164
end
@@ -225,14 +225,14 @@ end
225225
@test d1s2 == d1s3
226226
@test propertynames(d1s) == [:c, :d, :e, :variable, :value]
227227
@test d1s == d1m
228-
d1m = stack(d1[:, [1,3,4]], Not(:a))
228+
d1m = stack(d1[:, [1, 3, 4]], Not(:a))
229229
@test propertynames(d1m) == [:a, :variable, :value]
230230

231231
# Test naming of measure/value columns
232232
d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval)
233233
@test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval)
234234
@test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval]
235-
d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval)
235+
d1m_named = stack(d1[:, [1, 3, 4]], Not(:a), variable_name=:letter, value_name=:someval)
236236
@test propertynames(d1m_named) == [:a, :letter, :someval]
237237

238238
# test empty measures or ids
@@ -270,21 +270,21 @@ end
270270
@test d1s[!, 5] isa DataFrames.StackedVector
271271
@test ndims(d1s[!, 5]) == 1
272272
@test ndims(typeof(d1s[!, 2])) == 1
273-
@test d1s[!, 4][[1,24]] == ["a", "b"]
274-
@test d1s[!, 5][[1,24]] == [1, 4]
273+
@test d1s[!, 4][[1, 24]] == ["a", "b"]
274+
@test d1s[!, 5][[1, 24]] == [1, 4]
275275
@test_throws ArgumentError d1s[!, 4][true]
276276
@test_throws ArgumentError d1s[!, 5][true]
277277
@test_throws ArgumentError d1s[!, 4][1.0]
278278
@test_throws ArgumentError d1s[!, 5][1.0]
279279

280280
d1ss = stack(d1, [:a, :b], view=true)
281-
@test d1ss[!, 4][[1,24]] == ["a", "b"]
281+
@test d1ss[!, 4][[1, 24]] == ["a", "b"]
282282
@test d1ss[!, 4] isa DataFrames.RepeatedVector
283283
d1ss = stack(d1, [:a, :b], view=true, variable_eltype=String)
284-
@test d1ss[!, 4][[1,24]] == ["a", "b"]
284+
@test d1ss[!, 4][[1, 24]] == ["a", "b"]
285285
@test d1ss[!, 4] isa DataFrames.RepeatedVector
286286
d1ss = stack(d1, [:a, :b], view=true, variable_eltype=Symbol)
287-
@test d1ss[!, 4][[1,24]] == [:a, :b]
287+
@test d1ss[!, 4][[1, 24]] == [:a, :b]
288288
@test d1ss[!, 4] isa DataFrames.RepeatedVector
289289

290290
# Those tests check indexing RepeatedVector/StackedVector by a vector
@@ -307,7 +307,7 @@ end
307307
@test d1s2 == d1s3
308308
@test propertynames(d1s) == [:c, :d, :e, :variable, :value]
309309
@test d1s == d1m
310-
d1m = stack(d1[:, [1,3,4]], Not(:a), view=true)
310+
d1m = stack(d1[:, [1, 3, 4]], Not(:a), view=true)
311311
@test propertynames(d1m) == [:a, :variable, :value]
312312

313313
d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true)
@@ -329,13 +329,13 @@ end
329329
@test d1us3 == unstack(d1s2)
330330

331331
# test unstack with exactly one key column that is not passed
332-
df1 = stack(DataFrame(rand(10,10)))
332+
df1 = stack(DataFrame(rand(10, 10)))
333333
df1[!, :id] = 1:100
334334
@test size(unstack(df1, :variable, :value)) == (100, 11)
335335
@test unstack(df1, :variable, :value) unstack(df1)
336336

337337
# test empty keycol
338-
@test_throws ArgumentError unstack(stack(DataFrame(rand(3,2))), :variable, :value)
338+
@test_throws ArgumentError unstack(stack(DataFrame(rand(3, 2))), :variable, :value)
339339
end
340340

341341
@testset "column names duplicates" begin
@@ -494,7 +494,7 @@ end
494494
end
495495

496496
@testset "test stack eltype" begin
497-
df = DataFrame(rand(4,5))
497+
df = DataFrame(rand(4, 5))
498498
sdf = stack(df)
499499
@test eltype(sdf.variable) === String
500500
@test eltype(typeof(sdf.variable)) === String
@@ -507,4 +507,70 @@ end
507507
@test eltype(typeof(sdf2.value)) === Float64
508508
end
509509

510+
@testset "permutedims" begin
    # Float64, Int and Bool value columns next to a String names column
    mixed = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2))

    @test_throws MethodError transpose(mixed)
    @test_throws ArgumentError permutedims(mixed, :bar)

    mixed_pd = permutedims(mixed, 1)
    @test nrow(mixed_pd) == ncol(mixed) - 1
    @test ncol(mixed_pd) == nrow(mixed) + 1
    @test names(mixed_pd) == ["a", "x", "y"]
    @test mixed_pd == permutedims(mixed, :a) == permutedims(mixed, 1)
    @test names(permutedims(mixed, :a, :foo)) == ["foo", "x", "y"]

    # Every result row is: original column name, followed by that column's values
    mixed_srcnames = names(mixed)[2:end]
    for (k, r) in enumerate(eachrow(mixed_pd))
        @test Vector(r) == vcat(mixed_srcnames[k], mixed[!, mixed_srcnames[k]])
    end

    # All columns should be promoted
    for col in (:x, :y)
        @test eltype(mixed_pd[!, col]) == Float64
    end

    # A column holding both a number and a string forces `Any`
    hetero = DataFrame(a=["x", "y"], b=[1.0, "str"], c=[1, 2], d=rand(Bool, 2))

    hetero_pd = permutedims(hetero, :a)
    @test nrow(hetero_pd) == ncol(hetero) - 1
    @test ncol(hetero_pd) == nrow(hetero) + 1
    @test names(hetero_pd) == ["a", "x", "y"]

    hetero_srcnames = names(hetero)[2:end]
    for (k, r) in enumerate(eachrow(hetero_pd))
        @test Vector(r) == vcat(hetero_srcnames[k], hetero[!, hetero_srcnames[k]])
    end
    for col in (:x, :y)
        @test Any == eltype(hetero_pd[!, col])
    end

    # Duplicate values in the names column need `makeunique=true`
    dupes = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10))

    dupes_pd_names = vcat(["a", "x"], ["x_$i" for i in 1:9])
    @test_throws ArgumentError permutedims(dupes, 1)
    @test names(permutedims(dupes, 1, makeunique=true)) == dupes_pd_names
    @test_throws ArgumentError permutedims(dupes[!, [:a]], 1) # single column branch
    @test names(permutedims(dupes[!, [:a]], 1, makeunique=true)) == dupes_pd_names

    # `e` and `f` are valid names columns; `g` and `h` allow missing and are not
    srcs = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing],
                     e=["x", "y"], f=[:x, :y],
                     g=[missing, "y"], h=Union{Missing, String}["x", "y"])

    @test permutedims(srcs[!, [:a, :b, :c, :e]], :e) ==
          permutedims(srcs[!, [:e, :a, :b, :c]], 1) ==
          permutedims(srcs[!, [:a, :b, :c, :f]], :f, :e)
    # Can permute single-column
    @test permutedims(srcs[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[])
    # Can't index float Column
    @test_throws ArgumentError permutedims(srcs[!, [:a, :b, :c]], 1)
    @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1)
    # Can't index columns that allow for missing
    @test_throws ArgumentError permutedims(srcs[!, [:g, :a, :b, :c]], 1)
    @test_throws ArgumentError permutedims(srcs[!, [:h, :a, :b]], 1)
    # Can't permute empty `df` ...
    @test_throws BoundsError permutedims(DataFrame(), 1)
    # ... but can permute zero-row df
    @test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"])
end
575+
510576
end # module

0 commit comments

Comments
 (0)