Skip to content

Commit 4b44b75

Browse files
author
Ivan Silantyev
committed
fix json_normalize with meta far different from record_path
Fix json_normalize when record_path has several levels and a nested meta path diverges from record_path.
1 parent d966462 commit 4b44b75

File tree

2 files changed

+59
-7
lines changed

2 files changed

+59
-7
lines changed

pandas/io/json/_normalize.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -533,19 +533,33 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
533533
meta_vals: DefaultDict = defaultdict(list)
534534
meta_keys = [sep.join(val) for val in _meta]
535535

536-
def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
536+
def _recursive_meta_extract(data, meta_remaining_path, meta_key, seen_meta) -> None:
    """
    Follow ``meta_remaining_path`` through ``data`` and record the value found.

    ``errors`` and ``_pull_field`` are not defined here; they are read from
    the enclosing scope.

    Parameters
    ----------
    data : object
        Current node. Anything that is not a dict is left untouched and
        nothing is written to ``seen_meta``.
    meta_remaining_path : list
        Keys still to be followed. Every key but the last is looked up
        directly in ``data``; the last one is resolved with ``_pull_field``.
    meta_key : str
        Key under which the result is stored in ``seen_meta``.
    seen_meta : dict
        Mapping updated in place.

    Raises
    ------
    KeyError
        If an intermediate key is missing and ``errors`` is not ``"ignore"``.
    """
    if not isinstance(data, dict):
        return

    head = meta_remaining_path[0]

    if len(meta_remaining_path) <= 1:
        # Last path component: delegate to _pull_field for the final lookup.
        seen_meta[meta_key] = _pull_field(data, head)
        return

    if head in data:
        _recursive_meta_extract(
            data[head], meta_remaining_path[1:], meta_key, seen_meta
        )
        return

    if errors == "ignore":
        # Missing intermediate key is tolerated: record NaN and stop here
        # instead of falling through to the raise below.
        seen_meta[meta_key] = np.nan
        return

    raise KeyError(
        f"SubKey {head} of key {meta_key} not found in meta data."
    )
547+
548+
def _recursive_extract(data, passed_path, remaining_path, seen_meta, level: int = 0) -> None:
537549
if isinstance(data, dict):
538550
data = [data]
539-
if len(path) > 1:
551+
if len(remaining_path) > 1:
540552
for obj in data:
541553
for val, key in zip(_meta, meta_keys):
542-
if level + 1 == len(val):
554+
if level + 1 == len(val) and passed_path == val[:-1]:
543555
seen_meta[key] = _pull_field(obj, val[-1])
556+
elif level + 1 < len(val) and passed_path == val[:level] and remaining_path[0] != val[level]:
557+
_recursive_meta_extract(obj, val[level:], key, seen_meta)
544558

545-
_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
559+
_recursive_extract(obj[remaining_path[0]], passed_path + remaining_path[:1], remaining_path[1:], seen_meta, level=level + 1)
546560
else:
547561
for obj in data:
548-
recs = _pull_records(obj, path[0])
562+
recs = _pull_records(obj, remaining_path[0])
549563
recs = [
550564
nested_to_record(r, sep=sep, max_level=max_level)
551565
if isinstance(r, dict)
@@ -556,14 +570,14 @@ def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
556570
# For repeating the metadata later
557571
lengths.append(len(recs))
558572
for val, key in zip(_meta, meta_keys):
559-
if level + 1 > len(val):
573+
if key in seen_meta:
560574
meta_val = seen_meta[key]
561575
else:
562576
meta_val = _pull_field(obj, val[level:])
563577
meta_vals[key].append(meta_val)
564578
records.extend(recs)
565579

566-
_recursive_extract(data, record_path, {}, level=0)
580+
_recursive_extract(data, [], record_path, {}, level=0)
567581

568582
result = DataFrame(records)
569583

pandas/tests/io/json/test_normalize.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,44 @@ def test_shallow_nested(self):
301301
expected = DataFrame(ex_data, columns=result.columns)
302302
tm.assert_frame_equal(result, expected)
303303

304+
def test_nested_record_path_with_divergent_nested_meta(self):
    # Renamed from ``c``: pytest only collects methods whose names start
    # with ``test``, so the original name meant this check was never run.
    #
    # The nested meta path ("info", "governor") shares no prefix with the
    # nested record path ("details", "counties").
    data = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "details": {
                "counties": [
                    {"name": "Dade", "population": 12345},
                    {"name": "Broward", "population": 40000},
                    {"name": "Palm Beach", "population": 60000},
                ]
            },
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "details": {
                "counties": [
                    {"name": "Summit", "population": 1234},
                    {"name": "Cuyahoga", "population": 1337},
                ]
            },
        },
    ]

    result = json_normalize(
        data, ["details", "counties"], ["state", "shortname", ["info", "governor"]]
    )
    ex_data = {
        "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
        "state": ["Florida"] * 3 + ["Ohio"] * 2,
        "shortname": ["FL", "FL", "FL", "OH", "OH"],
        "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
        "population": [12345, 40000, 60000, 1234, 1337],
    }
    # Column order is taken from the result, so only the values are compared.
    expected = DataFrame(ex_data, columns=result.columns)
    tm.assert_frame_equal(result, expected)
341+
304342
def test_nested_meta_path_with_nested_record_path(self, state_data):
305343
# GH 27220
306344
result = json_normalize(

0 commit comments

Comments
 (0)