Skip to content
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* Joining functions now support `order` keyword argument allowing the user
to specify the order of the rows in the produced table
([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
* Add `nest`, `unnest`, `extract`, and `extract!` functions; improve `flatten`
([#3258](https://github.com/JuliaData/DataFrames.jl/pull/3258))

## Bug fixes

Expand Down
4 changes: 4 additions & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ rename!
```@docs
append!
combine
extract
extract!
fillcombinations
flatten
hcat
Expand All @@ -85,6 +87,7 @@ insertcols!
invpermute!
mapcols
mapcols!
nest
permute!
prepend!
push!
Expand All @@ -102,6 +105,7 @@ table_transformation
transform
transform!
vcat
unnest
```

## Reshaping data frames between tall and wide formats
Expand Down
5 changes: 5 additions & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ export AbstractDataFrame,
disallowmissing!,
dropmissing!,
dropmissing,
extract,
extract!,
fillcombinations,
flatten,
groupby,
Expand All @@ -76,6 +78,7 @@ export AbstractDataFrame,
mapcols,
mapcols!,
ncol,
nest,
nonunique,
nrow,
order,
Expand All @@ -95,6 +98,7 @@ export AbstractDataFrame,
transform,
transform!,
unique!,
unnest,
unstack,
valuecols,
metadata,
Expand Down Expand Up @@ -166,6 +170,7 @@ include("abstractdataframe/show.jl")
include("groupeddataframe/show.jl")
include("dataframerow/show.jl")
include("abstractdataframe/io.jl")
include("abstractdataframe/nest.jl")

include("other/tables.jl")
include("other/names.jl")
Expand Down
130 changes: 0 additions & 130 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2502,136 +2502,6 @@ function Missings.allowmissing(df::AbstractDataFrame,
return new_df
end

"""
flatten(df::AbstractDataFrame, cols)

When columns `cols` of data frame `df` have iterable elements that define
`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
element of each `col` in `cols` is flattened, meaning the column corresponding
to `col` becomes a longer vector where the original entries are concatenated.
Elements of row `i` of `df` in columns other than `cols` will be repeated
according to the length of `df[i, col]`. These lengths must therefore be the
same for each `col` in `cols`, or else an error is raised. Note that these
elements are not copied, and thus if they are mutable changing them in the
returned `DataFrame` will affect `df`.

`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).

$METADATA_FIXED

# Examples

```jldoctest
julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
2×3 DataFrame
Row │ a b c
│ Int64 Array… Array…
─────┼───────────────────────
1 │ 1 [1, 2] [5, 6]
2 │ 2 [3, 4] [7, 8]

julia> flatten(df1, :b)
4×3 DataFrame
Row │ a b c
│ Int64 Int64 Array…
─────┼──────────────────────
1 │ 1 1 [5, 6]
2 │ 1 2 [5, 6]
3 │ 2 3 [7, 8]
4 │ 2 4 [7, 8]

julia> flatten(df1, [:b, :c])
4×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 1 5
2 │ 1 2 6
3 │ 2 3 7
4 │ 2 4 8

julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
2×2 DataFrame
Row │ a b
│ Int64 Tuple…
─────┼───────────────────
1 │ 1 ("p", "q")
2 │ 2 ("r", "s")

julia> flatten(df2, :b)
4×2 DataFrame
Row │ a b
│ Int64 String
─────┼───────────────
1 │ 1 p
2 │ 1 q
3 │ 2 r
4 │ 2 s

julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
2×3 DataFrame
Row │ a b c
│ Int64 Array… Array…
─────┼───────────────────────
1 │ 1 [1, 2] [5, 6]
2 │ 2 [3, 4] [7]

julia> flatten(df3, [:b, :c])
ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
```
"""
function flatten(df::AbstractDataFrame,
                 cols::Union{ColumnIndex, MultiColumnIndex})
    _check_consistency(df)

    idxcols = index(df)[cols]
    # No column selected: return a copy of `df` after calling
    # `_drop_all_nonnote_metadata!` on it.
    if isempty(idxcols)
        cdf = copy(df)
        _drop_all_nonnote_metadata!(cdf)
        return cdf
    end

    # The per-row lengths of the first selected column are the reference that
    # every selected column must match. A typed comprehension is used so that
    # `lengths` is always a `Vector{Int}` (the type `repeat_lengths!` accepts),
    # including when the column is empty and its element type is not concrete.
    col1 = first(idxcols)
    lengths = Int[length(x) for x in df[!, col1]]
    for col in idxcols
        v = df[!, col]
        if any(x -> length(x[1]) != x[2], zip(v, lengths))
            # Only computed on the error path: first row whose length differs.
            r = findfirst(x -> x != 0, length.(v) .- lengths)
            colnames = _names(df)
            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
                                "and :$(colnames[col]) are not the same in row $r"))
        end
    end

    # Columns that are not flattened have each row repeated `lengths[i]` times.
    new_df = similar(df[!, Not(cols)], sum(lengths))
    for name in _names(new_df)
        repeat_lengths!(new_df[!, name], df[!, name], lengths)
    end

    # Insert flattened columns in increasing position so that each
    # `insertcols!` call places the column at its original index.
    length(idxcols) > 1 && sort!(idxcols)
    for col in idxcols
        col_to_flatten = df[!, col]
        # `eltype` returns a type, so the subtype operator is required here;
        # `eltype(...) isa AbstractVector` is always false and left the
        # `reduce(vcat, ...)` branch unreachable.
        fast_path = eltype(col_to_flatten) <: AbstractVector &&
                    !isempty(col_to_flatten)
        flattened_col = fast_path ?
            reduce(vcat, col_to_flatten) :
            collect(Iterators.flatten(col_to_flatten))
        insertcols!(new_df, col, _names(df)[col] => flattened_col)
    end

    _copy_all_note_metadata!(new_df, df)
    return new_df
end

# Fill `longnew` in place so that `shortold[i]` occupies `lengths[i]` consecutive
# positions, in order, starting at position 1. Elements are assigned as-is (not
# copied). A length of zero contributes no positions. Returns `nothing`.
function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
                         lengths::AbstractVector{Int})
    start = 1
    @inbounds for i in eachindex(shortold)
        stop = start + lengths[i] - 1
        value = shortold[i]
        for pos in start:stop
            longnew[pos] = value
        end
        start = stop + 1
    end
    return nothing
end

# Disallowed getindex and setindex! operations that are a common mistake

Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) =
Expand Down
Loading