Skip to content

Commit 39a4681

Browse files
authored
Improve error message for pack_flat with NaN values in index (#450)
1 parent 070f289 commit 39a4681

File tree

2 files changed: +64 −2 lines changed

src/nested_pandas/series/packer.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,28 @@ def pack_flat(
9393
nested_pandas.series.dtype.NestedDtype : The dtype of the output series.
9494
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
9595
"""
96-
9796
if on is not None:
9897
df = df.set_index(on)
9998
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
10099
sorted_flat = df.sort_index(kind="stable")
101-
return pack_sorted_df_into_struct(sorted_flat, name=name)
100+
try:
101+
return pack_sorted_df_into_struct(sorted_flat, name=name)
102+
except ValueError:
103+
# Check if the error is due to NaN values and raise a more informative message
104+
if any(sorted_flat.index.get_level_values(i).hasnans for i in range(sorted_flat.index.nlevels)):
105+
if on is None:
106+
raise ValueError(
107+
"The index contains NaN values. "
108+
"NaN values are not supported because they cannot be used for grouping rows. "
109+
"Please remove or fill NaN values before packing."
110+
) from None
111+
cols = [on] if isinstance(on, str) else list(on)
112+
raise ValueError(
113+
f"Column(s) {cols} contain NaN values. "
114+
"NaN values are not supported because they cannot be used for grouping rows. "
115+
"Please remove or fill NaN values before packing."
116+
) from None
117+
raise
102118

103119

104120
def pack_seq(

tests/nested_pandas/series/test_packer.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,3 +598,49 @@ def test_calculate_sorted_index_offsets_raises_when_not_sorted():
598598
index = pd.Index([1, 2, 1, 2, 3, 3, 4, 4, 4])
599599
with pytest.raises(ValueError):
600600
packer.calculate_sorted_index_offsets(index)
601+
602+
603+
@pytest.mark.parametrize(
    "index",
    [
        # Flat float index whose last label is NaN.
        pd.Index([1.0, 1.0, 2.0, 2.0, np.nan]),
        # Two-level index with a NaN in the second level only.
        pd.MultiIndex.from_arrays(([1, 1, 2, 2, 2], [1.0, 2.0, np.nan, 1.0, 2.0])),
    ],
)
def test_pack_flat_raises_with_nan_in_index(index):
    """Check that pack_flat() reports NaN index labels with a clear ValueError.

    The frame is packed without ``on``, so the expected message is the one
    that talks about the index rather than about named columns.

    This is a regression test for https://github.com/lincc-frameworks/nested-pandas/issues/440
    """
    columns = {
        "a": [1, 2, 3, 4, 5],
        "b": [0, 1, 0, 1, 0],
    }
    flat_df = pd.DataFrame(data=columns, index=index)

    expected_message = "The index contains NaN values"
    with pytest.raises(ValueError, match=expected_message):
        packer.pack_flat(flat_df)
624+
625+
626+
@pytest.mark.parametrize(
    "col_data,col_dtype",
    [
        # Default dtype inference, missing value given as np.nan.
        ([1.0, 1.0, 2.0, 2.0, np.nan], None),
        # Arrow-backed float64 column, missing value given as None.
        ([1.0, 1.0, 2.0, 2.0, None], pd.ArrowDtype(pa.float64())),
    ],
)
def test_pack_flat_raises_with_nan_in_on_column(col_data, col_dtype):
    """Check that pack_flat() names the 'on' column when it holds missing values.

    Column "c" is built from the parametrized data and dtype, then used as the
    ``on`` argument; the ValueError message must mention that column.

    This is a regression test for https://github.com/lincc-frameworks/nested-pandas/issues/440
    """
    key_column = pd.array(col_data, dtype=col_dtype)
    flat_df = pd.DataFrame(
        data={
            "a": [1, 2, 3, 4, 5],
            "b": [0, 1, 0, 1, 0],
            "c": key_column,
        },
    )

    expected_pattern = r"Column\(s\) \['c'\] contain NaN values"
    with pytest.raises(ValueError, match=expected_pattern):
        packer.pack_flat(flat_df, on="c")

0 commit comments

Comments
 (0)