fixes unnest_wider bug w missing key (#143)

drizk1 · rdboyes · kdpsingh · web-flow · commit 1163b981216c · 2025-03-24T01:41:17.000-07:00
* fixes unnest_wider bug w missing key * address an edge case, improves stacktrace with throw * adds test to get code cov up for nest.jl * makes names when unnesting wider more explicit so that unnesting seomthing w name that already exists doenst overwrite that name * fix json example * fix json part2 * remove json example for now * remove json from docs toml * copy and paste og nesting docs * learn to read, add json ex change default for nesting name * learn to scroll * fixes dif length arrays with longer * groupby nest test 4 codecov * unnest tuples wider support * basic logging for main verbs (#138) * basic logging for select, mutate, and transmute * unit testing for logs * remove deepdiffs dependency * adds tests for logs on the rest of the functions * typo fix * add mutate numbers to log * adds join logging, fix cov x * Fix esedge case for logging with grouped data frames. * :newsize mode logs correct type * add detail for row_change and col_change * add brief docs, bump v, up news * fixes log when grouped mutate, adds fillmissing, dropmissing log support * fixed fxn call * fix join log if stmnt, bump cov attempt w 2tests * add slice log support * change slice_min_max to not use`@filter` bc of logging msg dupes * adds unite, sep, sep_rows * adds logging for nests * minor docs edits for settings * exclude log.jl from code coverage for now --------- Co-authored-by: Daniel Rizk <rizkytennis@gmail.com> Co-authored-by: Karandeep Singh <karandeep@gmail.com> * fixes count n issue (#145) * fixes count n issue * gets rid of xs lines in conversion to improve testing * revert type converts, add tests * basic logging for main verbs (#138) * basic logging for select, mutate, and transmute * unit testing for logs * remove deepdiffs dependency * adds tests for logs on the rest of the functions * typo fix * add mutate numbers to log * adds join logging, fix cov x * Fix esedge case for logging with grouped data frames. 
* :newsize mode logs correct type * add detail for row_change and col_change * add brief docs, bump v, up news * fixes log when grouped mutate, adds fillmissing, dropmissing log support * fixed fxn call * fix join log if stmnt, bump cov attempt w 2tests * add slice log support * change slice_min_max to not use`@filter` bc of logging msg dupes * adds unite, sep, sep_rows * adds logging for nests * minor docs edits for settings * exclude log.jl from code coverage for now --------- Co-authored-by: Daniel Rizk <rizkytennis@gmail.com> Co-authored-by: Karandeep Singh <karandeep@gmail.com> * Updated NEWS.md * Set `fail-on-error` to false for coveralls, removed excluded coverage from log.jl. --------- Co-authored-by: Randall Boyes <33524191+rdboyes@users.noreply.github.com> Co-authored-by: Karandeep Singh <karandeep@gmail.com> * Updated NEWS.md --------- Co-authored-by: Randall Boyes <33524191+rdboyes@users.noreply.github.com> Co-authored-by: Karandeep Singh <karandeep@gmail.com>
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,7 @@
 
 ## v.0.17.0 - 2025-03-24
 - Bugfix: `@count()` can now be called multiple times. If column `n` already exists, then the new column containing the count will be `nn` (and so on).
+- Bugfix: `@unnest_wider()` now works when some dictionaries lack keys that others have; the absent entries are filled with `missing`
 - Adds logging ability to track changes to data frames with `TidierData_set("log", true)`
 - Adds docs describing logging and code printing
 
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -11,3 +11,4 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 TidierData = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
diff --git a/docs/examples/UserGuide/nesting.jl b/docs/examples/UserGuide/nesting.jl
@@ -11,15 +11,15 @@ nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz)
 # To return to the original dataframe, you can unnest wider and then longer.
 
 @chain nested_df begin
-    @unnest_wider(n3:n2)
+    @unnest_wider(n3:n2, names_sep = nothing)
     @unnest_longer(y:ab)
 end
 
 # Or you can unnest longer and then wider.
 
 @chain nested_df begin
   @unnest_longer(n3:n2)
-  @unnest_wider(n3:n2)
+  @unnest_wider(n3:n2, names_sep = nothing)
 end
 
 # ## `@unnest_longer`
@@ -67,5 +67,35 @@ df3 = DataFrame(
 
 @chain df3 begin 
     @unnest_wider(y)
-    @unnest_longer(a:c, keep_empty = true)
+    @unnest_longer(y_a:y_c, keep_empty = true)
+end
+
+# ## Unnesting JSON data
+
+using JSON
+
+json_str = """
+       {
+           "name": "Chris",
+           "age": 23,
+           "address": {
+               "city": "New York",
+               "country": "America"
+           },
+           "friends": [
+               {
+                   "name": "Emily",
+                   "hobbies": [ "biking", "music", "gaming" ]
+               },
+               {
+                   "name": "John",
+                   "hobbies": [ "soccer", "gaming" ]
+               }
+           ]
+       }
+       """;
+json_df = DataFrame(JSON.parse(json_str))
+
+@chain json_df begin
+       @unnest_wider(address, friends)
 end
diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -3223,7 +3223,7 @@ Unnest specified columns of arrays or dictionaries into wider format dataframe w
 # Arguments
 - `df`: A DataFrame.
 - `columns`: Columns to be unnested. These columns should contain arrays, dictionaries, dataframes, or tuples. Dictionarys headings will be converted to column names.
-- `names_sep`: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator.
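+- `names_sep`: An optional string placed between the original column name and each key or index when creating new column names. Defaults to `"_"`. Pass `names_sep = nothing` to omit the separator; for dictionary columns this also drops the original column name, so the keys alone become the new column names.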
 
 # Examples
 ```jldoctest
@@ -3233,11 +3233,11 @@ julia> df = DataFrame(name = ["Zaki", "Farida"], attributes = [
 
 julia> @unnest_wider(df, attributes)
 2×3 DataFrame
- Row │ name    city         age   
-     │ String  String       Int64 
-─────┼────────────────────────────
-   1 │ Zaki    New York        25
-   2 │ Farida  Los Angeles     30
+ Row │ name    attributes_city  attributes_age 
+     │ String  String           Int64          
+─────┼─────────────────────────────────────────
+   1 │ Zaki    New York                     25
+   2 │ Farida  Los Angeles                  30
 
 julia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
 2×3 DataFrame
@@ -3247,13 +3247,54 @@ julia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
    1 │     1  [1, 2]  [5, 6]
    2 │     2  [3, 4]  [7, 8]
 
-julia> @unnest_wider(df2, b:c, names_sep = "_")
+julia> @unnest_wider(df2, b:c, names_sep = "")
 2×5 DataFrame
- Row │ a      b_1    b_2    c_1    c_2   
+ Row │ a      b1     b2     c1     c2    
      │ Int64  Int64  Int64  Int64  Int64 
 ─────┼───────────────────────────────────
    1 │     1      1      2      5      6
    2 │     2      3      4      7      8
+
+
+julia> a1=Dict("a"=>1, "b"=>Dict("c"=>1, "d"=>2)); a2=Dict("a"=>1, "b"=>Dict("c"=>1)); a=[a1;a2]; df=DataFrame(a);
+
+julia> @unnest_wider(df, b)
+2×3 DataFrame
+ Row │ a      b_c    b_d     
+     │ Int64  Int64  Int64?  
+─────┼───────────────────────
+   1 │     1      1        2
+   2 │     1      1  missing 
+
+julia> a0=Dict("a"=>0, "b"=>0);  a1=Dict("a"=>1, "b"=>Dict("c"=>1, "d"=>2)); a2=Dict("a"=>2, "b"=>Dict("c"=>2)); a3=Dict("a"=>3, "b"=>Dict("c"=>3)); a=[a0;a1;a2;a3]; df3=DataFrame(a);
+
+julia> @unnest_wider(df3, b)
+4×3 DataFrame
+ Row │ a      b_c      b_d     
+     │ Int64  Int64?   Int64?  
+─────┼─────────────────────────
+   1 │     0  missing  missing 
+   2 │     1        1        2
+   3 │     2        2  missing 
+   4 │     3        3  missing 
+
+julia> df = DataFrame(x1 = ["one", "two", "three"], x2 = [(1, "a"), (2, "b"), (3, "c")])
+3×2 DataFrame
+ Row │ x1      x2       
+     │ String  Tuple…   
+─────┼──────────────────
+   1 │ one     (1, "a")
+   2 │ two     (2, "b")
+   3 │ three   (3, "c")
+
+julia> @unnest_wider df x2
+3×3 DataFrame
+ Row │ x1      x2_1   x2_2   
+     │ String  Int64  String 
+─────┼───────────────────────
+   1 │ one         1  a
+   2 │ two         2  b
+   3 │ three       3  c
 ```
 """
 
@@ -3388,7 +3429,7 @@ julia> @chain df begin
 
 julia> @chain df begin
          @nest(data = b:c_2)
-         @unnest_wider(data)
+         @unnest_wider(data, names_sep = nothing)
        end
 5×4 DataFrame
  Row │ a     b             c_1           c_2          
@@ -3402,7 +3443,7 @@ julia> @chain df begin
 
 julia> @chain df begin
          @nest(data = -a)
-         @unnest_wider(data) # wider first
+         @unnest_wider(data, names_sep = nothing) # wider first
          @unnest_longer(-a)  # then longer
        end
 15×4 DataFrame
@@ -3428,27 +3469,38 @@ julia> @chain df begin
 julia> @chain df begin
          @nest(data = -a)
          @unnest_longer(data) # longer first
-         @unnest_wider(-a)    # then wider
+         @unnest_wider(-a)    # then wider, with names_sep defaulting to "_"
        end
 15×4 DataFrame
- Row │ a     b      c_2    c_1   
-     │ Char  Int64  Int64  Int64 
-─────┼───────────────────────────
-   1 │ a         1     31     16
-   2 │ a         2     32     17
-   3 │ a         3     33     18
-   4 │ b         4     34     19
-   5 │ b         5     35     20
-   6 │ b         6     36     21
-   7 │ c         7     37     22
-   8 │ c         8     38     23
-   9 │ c         9     39     24
-  10 │ d        10     40     25
-  11 │ d        11     41     26
-  12 │ d        12     42     27
-  13 │ e        13     43     28
-  14 │ e        14     44     29
-  15 │ e        15     45     30
+ Row │ a     data_b  data_c_2  data_c_1 
+     │ Char  Int64   Int64     Int64    
+─────┼──────────────────────────────────
+   1 │ a          1        31        16
+   2 │ a          2        32        17
+   3 │ a          3        33        18
+   4 │ b          4        34        19
+   5 │ b          5        35        20
+   6 │ b          6        36        21
+   7 │ c          7        37        22
+   8 │ c          8        38        23
+   9 │ c          9        39        24
+  10 │ d         10        40        25
+  11 │ d         11        41        26
+  12 │ d         12        42        27
+  13 │ e         13        43        28
+  14 │ e         14        44        29
+  15 │ e         15        45        30
+
+julia> @chain df @group_by(a) @nest(data = b:c_2) @ungroup()
+5×2 DataFrame
+ Row │ a     data          
+     │ Char  DataFrame     
+─────┼─────────────────────
+   1 │ a     3×3 DataFrame 
+   2 │ b     3×3 DataFrame 
+   3 │ c     3×3 DataFrame 
+   4 │ d     3×3 DataFrame 
+   5 │ e     3×3 DataFrame 
 ```
 """
 
diff --git a/src/nests.jl b/src/nests.jl
@@ -1,4 +1,4 @@
-function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::Union{String, Nothing}=nothing)
+function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::Union{String, Nothing}="_")
     is_grouped = df isa GroupedDataFrame
     grouping_columns = is_grouped ? groupcols(df) : Symbol[]
     df_copy = copy(is_grouped ? parent(df) : df)
@@ -49,19 +49,56 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U
             for item in df_copy[!, col]
                 union!(keys_set, keys(item))
             end
-  
+        
             for key in keys_set
                 new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key))
-                df_copy[!, new_col_name] = getindex.(df_copy[!, col], key)
-            end
+                df_copy[!, new_col_name] = get.(df_copy[!, col], Ref(key), missing)
+            end        
   
         elseif col_type <: Array
             n = length(first(df_copy[!, col]))
             for i in 1:n
                 new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i))
-                df_copy[!, new_col_name] = getindex.(df_copy[!, col], i)
+                try 
+                    df_copy[!, new_col_name] = getindex.(df_copy[!, col], i)
+                catch
+                    throw("Try using `@unnest_longer($col)` before `@unnest_wider(attribute)`")
+                end
+            end
+        elseif col_type <: Tuple || (col_type <: Union{Tuple, Missing})
+            nonmissing = filter(x -> x !== missing, df_copy[!, col])
+            n = length(first(nonmissing))
+            for i in 1:n
+                new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i))
+                try 
+                    df_copy[!, new_col_name] = getindex.(df_copy[!, col], i)
+                catch
+                    throw("Error unnesting tuple from column $col. Try using `@unnest_longer($col)` before `@unnest_wider(attribute)`")
+                end
+            end
+        
+        elseif any(x -> x isa Dict, df_copy[!, col])
+            keys_set = Set{String}()
+            for item in df_copy[!, col]
+                if item isa Dict
+                    union!(keys_set, keys(item))
+                end
+            end
+            for key in keys_set
+                new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key))
+                df_copy[!, new_col_name] = [item isa Dict ? get(item, key, missing) : missing for item in df_copy[!, col]]
+            end
+        elseif any(x -> x isa Pair, df_copy[!, col])
+            keys_set = Set{Any}()
+            for item in df_copy[!, col]
+                if item isa Pair
+                    push!(keys_set, item.first)
+                end
+            end
+            for key in keys_set
+                new_col_name = names_sep === nothing ? Symbol(string(key)) : Symbol(string(col, names_sep, key))
+                df_copy[!, new_col_name] = [item isa Pair && item.first == key ? item.second : missing for item in df_copy[!, col]]
             end
-  
         else
             error("Column $col contains neither dictionaries nor arrays nor DataFrames")
         end
@@ -78,13 +115,14 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U
     return df_copy
 end
 
+
 """
 $docstring_unnest_wider
 """
 macro unnest_wider(df, exprs...)
   exprs = parse_blocks(exprs...)
 
-  names_sep = :(nothing) 
+  names_sep = :("_") 
   if length(exprs) >= 2 && isa(exprs[end], Expr) && exprs[end].head == :(=) && exprs[end].args[1] == :names_sep
     names_sep = esc(exprs[end].args[2]) 
     exprs = exprs[1:end-1] 
@@ -100,6 +138,8 @@ macro unnest_wider(df, exprs...)
   return df_expr
 end
 
+using DataFrames
+
 function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_include::Union{Nothing, Bool}=nothing, keep_empty::Bool=false)
     is_grouped = df isa GroupedDataFrame
     grouping_columns = is_grouped ? groupcols(df) : Symbol[]
@@ -116,10 +156,28 @@ function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_inc
                            x for x in df_copy[!, col]]
     end
   
+    # Pad rows if columns have different lengths.
+    for i in 1:nrow(df_copy)
+        # Collect lengths of each non-missing iterable in this row
+        current_lengths = [length(df_copy[i, col]) for col in column_symbols if !ismissing(df_copy[i, col])]
+        if !isempty(current_lengths)
+            maxlen = maximum(current_lengths)
+            for col in column_symbols
+                if !ismissing(df_copy[i, col])
+                    arr = df_copy[i, col]
+                    if length(arr) < maxlen
+                        df_copy[i, col] = vcat(arr, fill(missing, maxlen - length(arr)))
+                    end
+                end
+            end
+        end
+    end
+  
     # Apply filter if keep_empty is false
     if !keep_empty
       df_copy = filter(row -> !any(ismissing, [row[col] for col in column_symbols]), df_copy)
     end
+  
     # Flatten the dataframe
     flattened_df = flatten(df_copy, column_symbols)
   
@@ -139,6 +197,7 @@ function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_inc
     end
     return flattened_df
 end
+
   
 """
 $docstring_unnest_longer
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,7 +8,7 @@ DocMeta.setdocmeta!(TidierData, :DocTestSetup, :(using TidierData); recursive=tr
 
 doctest(TidierData)
 
-end
+end 
 
 using TidierData
 using Test