|
9 | 9 | from collections import defaultdict |
10 | 10 | from dataclasses import dataclass |
11 | 11 | from pathlib import Path |
12 | | -from typing import DefaultDict, Dict, Iterable, List |
| 12 | +from typing import Any, DefaultDict, Dict, Iterable, List |
13 | 13 |
|
14 | 14 | import click |
15 | 15 | import pandas as pd |
@@ -40,10 +40,62 @@ def path(self) -> Path: |
40 | 40 | return self.partition_dir / self.filename |
41 | 41 |
|
42 | 42 |
|
| 43 | +def _is_expandable(value: Any) -> bool: |
| 44 | + """Return ``True`` if *value* should be expanded into scalar columns.""" |
| 45 | + |
| 46 | + return isinstance(value, (dict, list, tuple)) |
| 47 | + |
| 48 | + |
| 49 | +def _flatten_nested(value: Any) -> Any: |
| 50 | + """Recursively convert nested *value* into a dict keyed by indices.""" |
| 51 | + |
| 52 | + if isinstance(value, dict): |
| 53 | + return {key: _flatten_nested(val) for key, val in value.items()} |
| 54 | + |
| 55 | + if isinstance(value, (list, tuple)): |
| 56 | + return {str(idx): _flatten_nested(val) for idx, val in enumerate(value)} |
| 57 | + |
| 58 | + return value |
| 59 | + |
| 60 | + |
def _expand_nested_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Expand list- or dict-typed columns in *frame* into scalar columns.

    Every column that holds at least one dict, list or tuple is flattened
    with ``pd.json_normalize``.  The new columns are named
    ``"<column>.<path>"`` (list/tuple positions appear as string indices)
    and are joined onto the frame.  A column consisting solely of containers
    is dropped afterwards; a column mixing containers and scalars is kept,
    with its container cells set to ``None``.

    The frame passed in is never modified: all changes are applied to a new
    frame, which is returned.  When no column needs expanding the original
    object itself is returned.

    NOTE: ``DataFrame.join`` is called without suffixes, so pandas raises if
    an expanded column name collides with a column that already exists.
    """

    result = frame

    # Iterate over a snapshot of the input columns; columns added by the
    # expansion below are not revisited.
    for column in list(frame.columns):
        series = result[column]
        mask = series.apply(_is_expandable)

        if not mask.any():
            continue

        # Scalar cells map to an empty record so that the normalized frame
        # still has exactly one row per input row.
        prepared_rows = [
            _flatten_nested(value) if expand else {}
            for value, expand in zip(series.tolist(), mask.tolist())
        ]
        expanded = pd.json_normalize(prepared_rows, sep=".")

        # ``expanded`` has no columns when every container was empty; there
        # is nothing to join in that case.
        if not expanded.empty:
            expanded.index = series.index
            expanded = expanded.add_prefix(f"{column}.")
            result = result.join(expanded)

        if mask.all():
            result = result.drop(columns=[column])
        else:
            if result is frame:
                # No join/drop has produced a new frame yet, so copy before
                # the in-place write to avoid clobbering the caller's data.
                result = frame.copy()
            result.loc[mask, column] = None

    return result
| 88 | + |
| 89 | + |
43 | 90 | def _normalize_records(records: Iterable[Dict]) -> pd.DataFrame: |
44 | 91 | """Return a flattened dataframe for *records*.""" |
45 | 92 |
|
46 | | - return pd.json_normalize(list(records), sep=".") |
| 93 | + frame = pd.json_normalize(list(records), sep=".") |
| 94 | + |
| 95 | + if frame.empty: |
| 96 | + return frame |
| 97 | + |
| 98 | + return _expand_nested_columns(frame) |
47 | 99 |
|
48 | 100 |
|
49 | 101 | def _partition_target( |
|
0 commit comments