JuliaData · bkamins · Dec 28, 2022 · Dec 28, 2022 · Jan 5, 2023 · Jan 5, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,8 @@
 * Joining functions now support `order` keyword argument allowing the user
   to specify the order of the rows in the produced table
   ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
+* Add `nest`, `unnest`, `extract`, and `extract!` functions; improve `flatten`
+  ([#3258](https://github.com/JuliaData/DataFrames.jl/pull/3258))
 
 ## Bug fixes
 

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -76,6 +76,8 @@ rename!
 ```@docs
 append!
 combine
+extract
+extract!
 fillcombinations
 flatten
 hcat
@@ -85,6 +87,7 @@ insertcols!
 invpermute!
 mapcols
 mapcols!
+nest
 permute!
 prepend!
 push!
@@ -102,6 +105,7 @@ table_transformation
 transform
 transform!
 vcat
+unnest
 ```
 
 ## Reshaping data frames between tall and wide formats

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -63,6 +63,8 @@ export AbstractDataFrame,
        disallowmissing!,
        dropmissing!,
        dropmissing,
+       extract,
+       extract!,
        fillcombinations,
        flatten,
        groupby,
@@ -76,6 +78,7 @@ export AbstractDataFrame,
        mapcols,
        mapcols!,
        ncol,
+       nest,
        nonunique,
        nrow,
        order,
@@ -95,6 +98,7 @@ export AbstractDataFrame,
        transform,
        transform!,
        unique!,
+       unnest,
        unstack,
        valuecols,
        metadata,
@@ -166,6 +170,7 @@ include("abstractdataframe/show.jl")
 include("groupeddataframe/show.jl")
 include("dataframerow/show.jl")
 include("abstractdataframe/io.jl")
+include("abstractdataframe/nest.jl")
 
 include("other/tables.jl")
 include("other/names.jl")

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -2502,136 +2502,6 @@ function Missings.allowmissing(df::AbstractDataFrame,
     return new_df
 end
 
-"""
-    flatten(df::AbstractDataFrame, cols)
-
-When columns `cols` of data frame `df` have iterable elements that define
-`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
-element of each `col` in `cols` is flattened, meaning the column corresponding
-to `col` becomes a longer vector where the original entries are concatenated.
-Elements of row `i` of `df` in columns other than `cols` will be repeated
-according to the length of `df[i, col]`. These lengths must therefore be the
-same for each `col` in `cols`, or else an error is raised. Note that these
-elements are not copied, and thus if they are mutable changing them in the
-returned `DataFrame` will affect `df`.
-
-`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-
-$METADATA_FIXED
-
-# Examples
-
-```jldoctest
-julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
-2×3 DataFrame
- Row │ a      b       c
-     │ Int64  Array…  Array…
-─────┼───────────────────────
-   1 │     1  [1, 2]  [5, 6]
-   2 │     2  [3, 4]  [7, 8]
-
-julia> flatten(df1, :b)
-4×3 DataFrame
- Row │ a      b      c
-     │ Int64  Int64  Array…
-─────┼──────────────────────
-   1 │     1      1  [5, 6]
-   2 │     1      2  [5, 6]
-   3 │     2      3  [7, 8]
-   4 │     2      4  [7, 8]
-
-julia> flatten(df1, [:b, :c])
-4×3 DataFrame
- Row │ a      b      c
-     │ Int64  Int64  Int64
-─────┼─────────────────────
-   1 │     1      1      5
-   2 │     1      2      6
-   3 │     2      3      7
-   4 │     2      4      8
-
-julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
-2×2 DataFrame
- Row │ a      b
-     │ Int64  Tuple…
-─────┼───────────────────
-   1 │     1  ("p", "q")
-   2 │     2  ("r", "s")
-
-julia> flatten(df2, :b)
-4×2 DataFrame
- Row │ a      b
-     │ Int64  String
-─────┼───────────────
-   1 │     1  p
-   2 │     1  q
-   3 │     2  r
-   4 │     2  s
-
-julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
-2×3 DataFrame
- Row │ a      b       c
-     │ Int64  Array…  Array…
-─────┼───────────────────────
-   1 │     1  [1, 2]  [5, 6]
-   2 │     2  [3, 4]  [7]
-
-julia> flatten(df3, [:b, :c])
-ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
-```
-"""
-function flatten(df::AbstractDataFrame,
-                 cols::Union{ColumnIndex, MultiColumnIndex})
-    _check_consistency(df)
-
-    idxcols = index(df)[cols]
-    if isempty(idxcols)
-        cdf = copy(df)
-        _drop_all_nonnote_metadata!(cdf)
-        return cdf
-    end
-
-    col1 = first(idxcols)
-    lengths = length.(df[!, col1])
-    for col in idxcols
-        v = df[!, col]
-        if any(x -> length(x[1]) != x[2], zip(v, lengths))
-            r = findfirst(x -> x != 0, length.(v) .- lengths)
-            colnames = _names(df)
-            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-                                "and :$(colnames[col]) are not the same in row $r"))
-        end
-    end
-
-    new_df = similar(df[!, Not(cols)], sum(lengths))
-    for name in _names(new_df)
-        repeat_lengths!(new_df[!, name], df[!, name], lengths)
-    end
-    length(idxcols) > 1 && sort!(idxcols)
-    for col in idxcols
-        col_to_flatten = df[!, col]
-        fast_path = eltype(col_to_flatten) isa AbstractVector &&
-                    !isempty(col_to_flatten)
-        flattened_col = fast_path ?
-            reduce(vcat, col_to_flatten) :
-            collect(Iterators.flatten(col_to_flatten))
-        insertcols!(new_df, col, _names(df)[col] => flattened_col)
-    end
-
-    _copy_all_note_metadata!(new_df, df)
-    return new_df
-end
-
-function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
-                         lengths::AbstractVector{Int})
-    counter = 1
-    @inbounds for i in eachindex(shortold)
-        l = lengths[i]
-        longnew[counter:(counter + l - 1)] .= Ref(shortold[i])
-        counter += l
-    end
-end
-
 # Disallowed getindex and setindex! operations that are a common mistake
 
 Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) =