Skip to content

Commit 4b68243

Browse files
committed
feat(display): support nested STRUCT and ARRAY data in interactive tables
1 parent 19e2c4f commit 4b68243

File tree

6 files changed

+745
-206
lines changed

6 files changed

+745
-206
lines changed

bigframes/display/_flatten.py

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Utilities for flattening nested data structures for display."""
16+
17+
from __future__ import annotations
18+
19+
from typing import cast
20+
21+
import pandas as pd
22+
import pyarrow as pa
23+
24+
25+
def flatten_nested_data(
    dataframe: pd.DataFrame,
) -> tuple[pd.DataFrame, dict[str, list[int]], list[str], set[str]]:
    """Flatten nested STRUCT and ARRAY columns for display.

    Returns a 4-tuple of:
      * the flattened DataFrame,
      * a mapping from the original row key (stringified index label) to the
        list of display-row positions produced for it (empty when no ARRAY
        columns were exploded),
      * the columns whose values should be cleared on continuation rows,
      * the set of column names that originated from nested data.
    """
    if dataframe.empty:
        return dataframe.copy(), {}, [], set()

    working = dataframe.copy()

    (
        struct_cols,
        array_cols,
        struct_array_cols,
        continuation_cols,
        nested_cols,
    ) = _classify_columns(working)

    # ARRAY-of-STRUCT columns go first: each becomes plain ARRAY columns,
    # which the explosion step below can then handle uniformly.
    working, array_cols = _flatten_array_of_struct_columns(
        working, struct_array_cols, array_cols, nested_cols
    )

    # Plain STRUCT columns become one scalar column per field.
    working, continuation_cols = _flatten_struct_columns(
        working, struct_cols, continuation_cols, nested_cols
    )

    # Finally explode ARRAY columns (original ones plus those just created).
    if not array_cols:
        return working, {}, continuation_cols, nested_cols

    working, row_groups = _explode_array_columns(working, array_cols)
    return working, row_groups, continuation_cols, nested_cols
66+
67+
68+
def _classify_columns(
69+
dataframe: pd.DataFrame,
70+
) -> tuple[list[str], list[str], list[str], list[str], set[str]]:
71+
"""Identify all STRUCT and ARRAY columns."""
72+
initial_columns = list(dataframe.columns)
73+
struct_columns: list[str] = []
74+
array_columns: list[str] = []
75+
array_of_struct_columns: list[str] = []
76+
clear_on_continuation_cols: list[str] = []
77+
nested_originated_columns: set[str] = set()
78+
79+
for col_name_raw, col_data in dataframe.items():
80+
col_name = str(col_name_raw)
81+
dtype = col_data.dtype
82+
if isinstance(dtype, pd.ArrowDtype):
83+
pa_type = dtype.pyarrow_dtype
84+
if pa.types.is_struct(pa_type):
85+
struct_columns.append(col_name)
86+
nested_originated_columns.add(col_name)
87+
elif pa.types.is_list(pa_type):
88+
array_columns.append(col_name)
89+
nested_originated_columns.add(col_name)
90+
if hasattr(pa_type, "value_type") and (
91+
pa.types.is_struct(pa_type.value_type)
92+
):
93+
array_of_struct_columns.append(col_name)
94+
else:
95+
clear_on_continuation_cols.append(col_name)
96+
elif col_name in initial_columns:
97+
clear_on_continuation_cols.append(col_name)
98+
return (
99+
struct_columns,
100+
array_columns,
101+
array_of_struct_columns,
102+
clear_on_continuation_cols,
103+
nested_originated_columns,
104+
)
105+
106+
107+
def _flatten_array_of_struct_columns(
108+
dataframe: pd.DataFrame,
109+
array_of_struct_columns: list[str],
110+
array_columns: list[str],
111+
nested_originated_columns: set[str],
112+
) -> tuple[pd.DataFrame, list[str]]:
113+
"""Flatten ARRAY of STRUCT columns into separate array columns for each field."""
114+
result_df = dataframe.copy()
115+
for col_name in array_of_struct_columns:
116+
col_data = result_df[col_name]
117+
pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype
118+
struct_type = pa_type.value_type
119+
120+
# Use PyArrow to reshape the list<struct> into multiple list<field> arrays
121+
arrow_array = pa.array(col_data)
122+
offsets = arrow_array.offsets
123+
values = arrow_array.values # StructArray
124+
flattened_fields = values.flatten() # List[Array]
125+
126+
new_cols_to_add = {}
127+
new_array_col_names = []
128+
129+
# Create new columns for each struct field
130+
for field_idx in range(struct_type.num_fields):
131+
field = struct_type.field(field_idx)
132+
new_col_name = f"{col_name}.{field.name}"
133+
nested_originated_columns.add(new_col_name)
134+
new_array_col_names.append(new_col_name)
135+
136+
# Reconstruct ListArray for this field
137+
# Use mask=arrow_array.is_null() to preserve nulls from the original list
138+
new_list_array = pa.ListArray.from_arrays(
139+
offsets, flattened_fields[field_idx], mask=arrow_array.is_null()
140+
)
141+
142+
new_cols_to_add[new_col_name] = pd.Series(
143+
new_list_array.to_pylist(),
144+
dtype=pd.ArrowDtype(pa.list_(field.type)),
145+
index=result_df.index,
146+
)
147+
148+
col_idx = result_df.columns.to_list().index(col_name)
149+
new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index)
150+
151+
result_df = pd.concat(
152+
[
153+
result_df.iloc[:, :col_idx],
154+
new_cols_df,
155+
result_df.iloc[:, col_idx + 1 :],
156+
],
157+
axis=1,
158+
)
159+
160+
# Update array_columns list
161+
array_columns.remove(col_name)
162+
# Add the new array columns
163+
array_columns.extend(new_array_col_names)
164+
return result_df, array_columns
165+
166+
167+
def _explode_array_columns(
168+
dataframe: pd.DataFrame, array_columns: list[str]
169+
) -> tuple[pd.DataFrame, dict[str, list[int]]]:
170+
"""Explode array columns into new rows."""
171+
exploded_rows = []
172+
array_row_groups: dict[str, list[int]] = {}
173+
non_array_columns = dataframe.columns.drop(array_columns).tolist()
174+
non_array_df = dataframe[non_array_columns]
175+
176+
for orig_idx in dataframe.index:
177+
non_array_data = non_array_df.loc[orig_idx].to_dict()
178+
array_values = {}
179+
max_len_in_row = 0
180+
non_na_array_found = False
181+
182+
for col_name in array_columns:
183+
val = dataframe.loc[orig_idx, col_name]
184+
if val is not None and not (
185+
isinstance(val, list) and len(val) == 1 and pd.isna(val[0])
186+
):
187+
array_values[col_name] = list(val)
188+
max_len_in_row = max(max_len_in_row, len(val))
189+
non_na_array_found = True
190+
else:
191+
array_values[col_name] = []
192+
193+
if not non_na_array_found:
194+
new_row = non_array_data.copy()
195+
for col_name in array_columns:
196+
new_row[f"{col_name}"] = pd.NA
197+
exploded_rows.append(new_row)
198+
orig_key = str(orig_idx)
199+
if orig_key not in array_row_groups:
200+
array_row_groups[orig_key] = []
201+
array_row_groups[orig_key].append(len(exploded_rows) - 1)
202+
continue
203+
204+
# Create one row per array element, up to max_len_in_row
205+
for array_idx in range(max_len_in_row):
206+
new_row = non_array_data.copy()
207+
208+
# Add the specific array element for this index
209+
for col_name in array_columns:
210+
if array_idx < len(array_values.get(col_name, [])):
211+
new_row[f"{col_name}"] = array_values[col_name][array_idx]
212+
else:
213+
new_row[f"{col_name}"] = pd.NA
214+
215+
exploded_rows.append(new_row)
216+
217+
# Track which rows belong to which original row
218+
orig_key = str(orig_idx)
219+
if orig_key not in array_row_groups:
220+
array_row_groups[orig_key] = []
221+
array_row_groups[orig_key].append(len(exploded_rows) - 1)
222+
223+
if exploded_rows:
224+
# Reconstruct the DataFrame to maintain original column order
225+
exploded_df = pd.DataFrame(exploded_rows)[dataframe.columns]
226+
for col in exploded_df.columns:
227+
# After explosion, object columns that are all-numeric (except for NAs)
228+
# should be converted to a numeric dtype for proper alignment.
229+
if exploded_df[col].dtype == "object":
230+
try:
231+
# Use nullable integer type to preserve integers
232+
exploded_df[col] = exploded_df[col].astype(pd.Int64Dtype())
233+
except (ValueError, TypeError):
234+
# Fallback for non-integer numerics
235+
try:
236+
exploded_df[col] = pd.to_numeric(exploded_df[col])
237+
except (ValueError, TypeError):
238+
# Keep as object if not numeric
239+
pass
240+
return exploded_df, array_row_groups
241+
else:
242+
return dataframe, array_row_groups
243+
244+
245+
def _flatten_struct_columns(
246+
dataframe: pd.DataFrame,
247+
struct_columns: list[str],
248+
clear_on_continuation_cols: list[str],
249+
nested_originated_columns: set[str],
250+
) -> tuple[pd.DataFrame, list[str]]:
251+
"""Flatten regular STRUCT columns."""
252+
result_df = dataframe.copy()
253+
for col_name in struct_columns:
254+
col_data = result_df[col_name]
255+
if isinstance(col_data.dtype, pd.ArrowDtype):
256+
pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype
257+
258+
# Use PyArrow to flatten the struct column without row iteration
259+
# combine_chunks() ensures we have a single array if it was chunked
260+
arrow_array = pa.array(col_data)
261+
flattened_fields = arrow_array.flatten()
262+
263+
new_cols_to_add = {}
264+
for field_idx in range(pa_type.num_fields):
265+
field = pa_type.field(field_idx)
266+
new_col_name = f"{col_name}.{field.name}"
267+
nested_originated_columns.add(new_col_name)
268+
clear_on_continuation_cols.append(new_col_name)
269+
270+
# Create a new Series from the flattened array
271+
new_cols_to_add[new_col_name] = pd.Series(
272+
flattened_fields[field_idx].to_pylist(),
273+
dtype=pd.ArrowDtype(field.type),
274+
index=result_df.index,
275+
)
276+
277+
col_idx = result_df.columns.to_list().index(col_name)
278+
new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index)
279+
result_df = pd.concat(
280+
[
281+
result_df.iloc[:, :col_idx],
282+
new_cols_df,
283+
result_df.iloc[:, col_idx + 1 :],
284+
],
285+
axis=1,
286+
)
287+
return result_df, clear_on_continuation_cols

0 commit comments

Comments
 (0)