Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 64 additions & 59 deletions src/nests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,64 +7,78 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U
column_symbols = names(df_copy, Cols(cols_expr...))

for col in column_symbols
col_type = typeof(df_copy[1, col])
non_missing_idx = findfirst(x -> x !== missing, df_copy[!, col])
col_type = non_missing_idx === nothing ? Missing : typeof(df_copy[non_missing_idx, col])

if col_type <: DataFrame
# Handling DataFrames
nested_col_names = unique([name for i in 1:nrow(df_copy) for name in names(df_copy[i, col])])
nested_col_names = unique([name for i in 1:nrow(df_copy) for name in names(df_copy[i, col])])

for nested_col in nested_col_names
new_col_name = names_sep === nothing ? nested_col : Symbol(string(col, names_sep, nested_col))
combined_nested_col = Any[missing for _ in 1:nrow(df_copy)]
for nested_col in nested_col_names
new_col_name = names_sep === nothing ? nested_col : Symbol(string(col, names_sep, nested_col))
combined_nested_col = Any[missing for _ in 1:nrow(df_copy)]

for row in 1:nrow(df_copy)
nested_df = df_copy[row, col]
if ncol(nested_df) > 0 && haskey(nested_df[1, :], nested_col)
combined_nested_col[row] = nested_df[!, nested_col]
# Extract single value if there's only one element
if length(combined_nested_col[row]) == 1
combined_nested_col[row] = combined_nested_col[row][1]
end
end
end
df_copy[!, new_col_name] = combined_nested_col
end
elseif col_type <: NamedTuple || col_type <: Union{NamedTuple, Missing}
# Handling NamedTuples and missing values
keys_set = Set{Symbol}()
for item in df_copy[!, col]
if item !== missing
union!(keys_set, keys(item))
end
end

for key in keys_set
new_col_name = names_sep === nothing ? key : Symbol(string(col, names_sep, key))
df_copy[!, new_col_name] = [item !== missing ? get(item, key, missing) : missing for item in df_copy[!, col]]
end


elseif col_type <: Dict
keys_set = Set{String}()
for row in 1:nrow(df_copy)
nested_df = df_copy[row, col]
if ncol(nested_df) > 0 && haskey(nested_df[1, :], nested_col)
combined_nested_col[row] = nested_df[!, nested_col]
# Extract a single value if there's only one element
if length(combined_nested_col[row]) == 1
combined_nested_col[row] = combined_nested_col[row][1]
end
end
end
df_copy[!, new_col_name] = combined_nested_col
end

elseif col_type <: NamedTuple || col_type <: Union{NamedTuple, Missing}
keys_set = Set{Symbol}()
for item in df_copy[!, col]
union!(keys_set, keys(item))
if item !== missing
union!(keys_set, Symbol.(keys(item)))
end
end
for key in keys_set
new_col_name = names_sep === nothing ? key : Symbol(string(col, names_sep, key))
df_copy[!, new_col_name] =
[item !== missing ? get(item, key, missing) : missing for item in df_copy[!, col]]
end

elseif col_type <: Dict || any(x -> x isa Dict, df_copy[!, col])
# Perform shallow unnesting: extract only the outer keys.
flattened = Vector{Union{Missing, Dict{String,Any}}}(undef, nrow(df_copy))
for i in 1:nrow(df_copy)
val = df_copy[i, col]
if val === missing
flattened[i] = missing
elseif val isa Dict
# Leave the inner dictionaries intact; do not flatten further.
flattened[i] = val
else
flattened[i] = missing
end
end
keys_set = Set{String}()
for d in flattened
if d !== missing
union!(keys_set, keys(d))
end
end

for key in keys_set
new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key))
df_copy[!, new_col_name] = get.(df_copy[!, col], Ref(key), missing)
end
df_copy[!, new_col_name] = [d === missing ? missing : get(d, key, missing) for d in flattened]
end

elseif col_type <: Array
n = length(first(df_copy[!, col]))
for i in 1:n
new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i))
try
df_copy[!, new_col_name] = getindex.(df_copy[!, col], i)
catch
throw("Try using `@unnest_longer($col)` before `@unnest_wider(attribute)`")
throw("Try using `@unnest_longer($col)` before `@unnest_wider($col)`") # COV_EXCL_LINE
end
end

elseif col_type <: Tuple || (col_type <: Union{Tuple, Missing})
nonmissing = filter(x -> x !== missing, df_copy[!, col])
n = length(first(nonmissing))
Expand All @@ -73,21 +87,10 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U
try
df_copy[!, new_col_name] = getindex.(df_copy[!, col], i)
catch
throw("Error unnesting tuple from column $col. Try using `@unnest_longer($col)` before `@unnest_wider(attribute)`")
end
end

elseif any(x -> x isa Dict, df_copy[!, col])
keys_set = Set{String}()
for item in df_copy[!, col]
if item isa Dict
union!(keys_set, keys(item))
throw("Error unnesting tuple from column $col. Try using `@unnest_longer($col)` before `@unnest_wider(attribute)`") # COV_EXCL_LINE
end
end
for key in keys_set
new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key))
df_copy[!, new_col_name] = [item isa Dict ? get(item, key, missing) : missing for item in df_copy[!, col]]
end

elseif any(x -> x isa Pair, df_copy[!, col])
keys_set = Set{Any}()
for item in df_copy[!, col]
Expand All @@ -97,18 +100,22 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U
end
for key in keys_set
new_col_name = names_sep === nothing ? Symbol(string(key)) : Symbol(string(col, names_sep, key))
df_copy[!, new_col_name] = [item isa Pair && item.first == key ? item.second : missing for item in df_copy[!, col]]
df_copy[!, new_col_name] =
[item isa Pair && item.first == key ? item.second : missing for item in df_copy[!, col]]
end

else
error("Column $col contains neither dictionaries nor arrays nor DataFrames")
error("Column $col contains neither dictionaries nor arrays nor DataFrames") # COV_EXCL_LINE
end

# Remove the original nested column.
select!(df_copy, Not(col))
end

if is_grouped
df_copy = groupby(df_copy, grouping_columns)
end

if log[]
@info generate_log(df, df_copy, "@unnest_wider", [:colchange])
end
Expand Down Expand Up @@ -138,8 +145,6 @@ macro unnest_wider(df, exprs...)
return df_expr
end

using DataFrames

function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_include::Union{Nothing, Bool}=nothing, keep_empty::Bool=false)
is_grouped = df isa GroupedDataFrame
grouping_columns = is_grouped ? groupcols(df) : Symbol[]
Expand Down Expand Up @@ -241,7 +246,7 @@ function nest_pairs(df; kwargs...)
start_idx = findfirst(==(start_col), names(df))
end_idx = findfirst(==(end_col), names(df))
if isnothing(start_idx) || isnothing(end_idx)
throw(ArgumentError("Column range $cols is invalid"))
throw(ArgumentError("Column range $cols is invalid")) # COV_EXCL_LINE
end
cols = names(df)[start_idx:end_idx]
elseif isa(cols, Symbol)
Expand Down
Loading