Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 7 additions & 15 deletions src/awkward/operations/ak_to_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,22 +234,14 @@ def recurse(layout, row_arrays, col_names):
else:
columns = pandas.MultiIndex.from_tuples([col_names])

# Before filling, ensure dtype is wide enough for "nan" fill value. Need
# at least three characters / bytes for 'nan' or b'nan', respectively.
# Pandas can handle None in object arrays, so convert masked strings
# to object arrays with None for missing values instead of filling with "nan"
if numpy.ma.is_masked(column):
if np.issubdtype(column.dtype, np.str_):
char_width = column.dtype.itemsize // 4
if char_width < 3:
column = column.astype(np.dtype(("U", 3)))
elif np.issubdtype(column.dtype, np.bytes_):
byte_width = column.dtype.itemsize
if byte_width < 3:
column = column.astype(np.dtype(("S", 3)))
# Pandas can't handle masked strings
if np.issubdtype(column.dtype, np.str_):
column = numpy.ma.filled(column, "nan")
elif np.issubdtype(column.dtype, np.bytes_):
column = numpy.ma.filled(column, b"nan")
if np.issubdtype(column.dtype, np.str_) or np.issubdtype(
column.dtype, np.bytes_
):
# Convert masked string/bytestring arrays to object arrays with None
column = numpy.where(column.mask, None, column.data).astype(object)

if (
last_row_arrays is not None
Expand Down
2 changes: 1 addition & 1 deletion tests/test_0331_pandas_indexedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,5 +182,5 @@ def test_union_to_record():
df_unionarray2["z"].values, [np.nan, np.nan, 999, np.nan, np.nan]
)
np.testing.assert_array_equal(
df_unionarray2["values"].values, ["nan", "one", "nan", "two", "nan"]
df_unionarray2["values"].values, [None, "one", None, "two", None]
)
32 changes: 16 additions & 16 deletions tests/test_3692_to_dataframe_masked_string_dtype_resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ def test_masked_string_array_with_option():
# Convert to dataframe - should not raise an error about dtype width
df = ak.operations.to_dataframe(array_str)

# Verify the conversion worked and "nan" appears for None values
# Verify the conversion worked and None values are preserved
assert df["values"].values[0] == "a"
assert df["values"].values[1] == "nan"
assert df["values"].values[1] is None
assert df["values"].values[2] == "c"
assert df["values"].values[3] == "d"
assert df["values"].values[4] == "e"
assert df["values"].values[5] == "nan"
assert df["values"].values[5] is None


def test_masked_bytestring_array_with_option():
Expand All @@ -35,13 +35,13 @@ def test_masked_bytestring_array_with_option():
# Convert to dataframe - should not raise an error about dtype width
df = ak.operations.to_dataframe(array_bytes)

# Verify the conversion worked and b"nan" appears for None values
# Verify the conversion worked and None values are preserved
assert df["values"].values[0] == b"x"
assert df["values"].values[1] == b"nan"
assert df["values"].values[1] is None
assert df["values"].values[2] == b"z"
assert df["values"].values[3] == b"a"
assert df["values"].values[4] == b"b"
assert df["values"].values[5] == b"nan"
assert df["values"].values[5] is None


def test_union_with_narrow_strings():
Expand All @@ -55,14 +55,14 @@ def test_union_with_narrow_strings():

# Verify conversion works without dtype errors
assert len(df) == 3
# The x column should have "nan" for missing values
assert df["x"].values[1] == "nan"
# The x column should have None for missing values
assert df["x"].values[1] is None


def test_single_char_strings_with_none():
"""Test very short strings with None values."""
# Single character strings with None values
# This is the edge case that needs dtype resizing
# This tests that even very short strings work correctly
array = ak.Array([["a", "b"], [None, "c"], ["d", None]])

df = ak.operations.to_dataframe(array)
Expand All @@ -73,8 +73,8 @@ def test_single_char_strings_with_none():
assert "b" in values
assert "c" in values
assert "d" in values
# Check that "nan" string appears for None values
assert values.count("nan") == 2
# Check that None appears for None values
assert values.count(None) == 2


def test_single_byte_bytestrings_with_none():
Expand All @@ -90,19 +90,19 @@ def test_single_byte_bytestrings_with_none():
assert b"b" in values
assert b"c" in values
assert b"d" in values
# Check that b"nan" appears for None values
assert values.count(b"nan") == 2
# Check that None appears for None values
assert values.count(None) == 2


def test_two_char_strings_with_none():
"""Test two-character strings with None values (edge case)."""
# Two character strings - exactly the edge case where "nan" (3 chars) won't fit
# Two character strings - tests that any string length works correctly
array = ak.Array([["ab", None], [None, "cd"]])

df = ak.operations.to_dataframe(array)

values = df["values"].values.tolist()
assert "ab" in values
assert "cd" in values
# "nan" should appear (3 characters)
assert values.count("nan") == 2
# None should appear for None values
assert values.count(None) == 2
74 changes: 74 additions & 0 deletions tests/test_3713_to_dataframe_none_vs_nan_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

import pytest

import awkward as ak

pandas = pytest.importorskip("pandas")


def test_none_vs_nan_string():
    """A missing entry and the literal string "nan" must not collapse together.

    Builds a two-element array (one None, one "nan"), converts it with
    ``ak.to_dataframe`` and inspects the "values" column of ``to_dict()``.
    """
    frame = ak.to_dataframe(ak.Array([None, "nan"]))
    by_row = frame.to_dict()["values"]

    # Row 0 was None in the input and has to come back as None itself.
    assert by_row[0] is None
    # Row 1 was a genuine string that merely happens to spell "nan".
    assert by_row[1] == "nan"
    # The two rows must remain distinguishable from each other.
    assert by_row[0] != by_row[1]


def test_none_vs_nan_bytestring():
    """A missing entry and the literal bytestring b"nan" must not collapse together.

    Bytestring counterpart of the string test: one None and one b"nan" go
    through ``ak.to_dataframe`` and the "values" column of ``to_dict()`` is
    checked row by row.
    """
    frame = ak.to_dataframe(ak.Array([None, b"nan"]))
    by_row = frame.to_dict()["values"]

    # Row 0 was None in the input and has to come back as None itself.
    assert by_row[0] is None
    # Row 1 was a genuine bytestring that merely happens to spell b"nan".
    assert by_row[1] == b"nan"
    # The two rows must remain distinguishable from each other.
    assert by_row[0] != by_row[1]


def test_nested_list_with_none_and_nan_string():
    """Nested lists mixing None and "nan" keep the two apart in the "values" column."""
    nested = ak.Array([["a", None, "nan"], ["b", "nan", None]])
    values = ak.to_dataframe(nested)["values"].values

    # Expected contents of the "values" column, in row order.
    expected = ["a", None, "nan", "b", "nan", None]
    for position, want in enumerate(expected):
        if want is None:
            # Missing entries are checked by identity, not equality.
            assert values[position] is None
        else:
            assert values[position] == want


def test_record_with_none_and_nan_string():
    """Record fields mixing None and "nan" keep the two apart, column by column."""
    records = [
        {"x": "value", "y": None},
        {"x": "nan", "y": "another"},
        {"x": None, "y": "nan"},
    ]
    frame = ak.to_dataframe(ak.Array(records))

    # Field "x": real string, literal "nan", then a missing entry.
    x_column = frame["x"].values
    assert x_column[0] == "value"
    assert x_column[1] == "nan"
    assert x_column[2] is None

    # Field "y": missing entry, real string, then literal "nan".
    y_column = frame["y"].values
    assert y_column[0] is None
    assert y_column[1] == "another"
    assert y_column[2] == "nan"
Loading