diff --git a/src/awkward/operations/ak_to_dataframe.py b/src/awkward/operations/ak_to_dataframe.py index 46e4206ad5..27d489d5da 100644 --- a/src/awkward/operations/ak_to_dataframe.py +++ b/src/awkward/operations/ak_to_dataframe.py @@ -234,22 +234,14 @@ def recurse(layout, row_arrays, col_names): else: columns = pandas.MultiIndex.from_tuples([col_names]) - # Before filling, ensure dtype is wide enough for "nan" fill value. Need - # at least three characters / bytes for 'nan' or b'nan', respectively. + # Pandas can handle None in object arrays, so convert masked strings + # to object arrays with None for missing values instead of filling with "nan" if numpy.ma.is_masked(column): - if np.issubdtype(column.dtype, np.str_): - char_width = column.dtype.itemsize // 4 - if char_width < 3: - column = column.astype(np.dtype(("U", 3))) - elif np.issubdtype(column.dtype, np.bytes_): - byte_width = column.dtype.itemsize - if byte_width < 3: - column = column.astype(np.dtype(("S", 3))) - # Pandas can't handle masked strings - if np.issubdtype(column.dtype, np.str_): - column = numpy.ma.filled(column, "nan") - elif np.issubdtype(column.dtype, np.bytes_): - column = numpy.ma.filled(column, b"nan") + if np.issubdtype(column.dtype, np.str_) or np.issubdtype( + column.dtype, np.bytes_ + ): + # Convert masked string/bytestring arrays to object arrays with None + column = numpy.where(column.mask, None, column.data).astype(object) if ( last_row_arrays is not None diff --git a/tests/test_0331_pandas_indexedarray.py b/tests/test_0331_pandas_indexedarray.py index a92bdbb890..23acbfa046 100644 --- a/tests/test_0331_pandas_indexedarray.py +++ b/tests/test_0331_pandas_indexedarray.py @@ -182,5 +182,5 @@ def test_union_to_record(): df_unionarray2["z"].values, [np.nan, np.nan, 999, np.nan, np.nan] ) np.testing.assert_array_equal( - df_unionarray2["values"].values, ["nan", "one", "nan", "two", "nan"] + df_unionarray2["values"].values, [None, "one", None, "two", None] ) diff --git a/tests/test_3692_to_dataframe_masked_string_dtype_resize.py b/tests/test_3692_to_dataframe_masked_string_dtype_resize.py index 863f6bebf0..e6a0a7642c 100644 --- a/tests/test_3692_to_dataframe_masked_string_dtype_resize.py +++ b/tests/test_3692_to_dataframe_masked_string_dtype_resize.py @@ -18,13 +18,13 @@ def test_masked_string_array_with_option(): # Convert to dataframe - should not raise an error about dtype width df = ak.operations.to_dataframe(array_str) - # Verify the conversion worked and "nan" appears for None values + # Verify the conversion worked and None values are preserved assert df["values"].values[0] == "a" - assert df["values"].values[1] == "nan" + assert df["values"].values[1] is None assert df["values"].values[2] == "c" assert df["values"].values[3] == "d" assert df["values"].values[4] == "e" - assert df["values"].values[5] == "nan" + assert df["values"].values[5] is None def test_masked_bytestring_array_with_option(): @@ -35,13 +35,13 @@ def test_masked_bytestring_array_with_option(): # Convert to dataframe - should not raise an error about dtype width df = ak.operations.to_dataframe(array_bytes) - # Verify the conversion worked and b"nan" appears for None values + # Verify the conversion worked and None values are preserved assert df["values"].values[0] == b"x" - assert df["values"].values[1] == b"nan" + assert df["values"].values[1] is None assert df["values"].values[2] == b"z" assert df["values"].values[3] == b"a" assert df["values"].values[4] == b"b" - assert df["values"].values[5] == b"nan" + assert df["values"].values[5] is None def test_union_with_narrow_strings(): @@ -55,14 +55,14 @@ def test_union_with_narrow_strings(): # Verify conversion works without dtype errors assert len(df) == 3 - # The x column should have "nan" for missing values - assert df["x"].values[1] == "nan" + # The x column should have None for missing values + assert df["x"].values[1] is None def test_single_char_strings_with_none(): """Test very short strings with None values.""" # Single character strings with None values - # This is the edge case that needs dtype resizing + # This tests that even very short strings work correctly array = ak.Array([["a", "b"], [None, "c"], ["d", None]]) df = ak.operations.to_dataframe(array) @@ -73,8 +73,8 @@ def test_single_char_strings_with_none(): assert "b" in values assert "c" in values assert "d" in values - # Check that "nan" string appears for None values - assert values.count("nan") == 2 + # Check that None appears for None values + assert values.count(None) == 2 def test_single_byte_bytestrings_with_none(): @@ -90,13 +90,13 @@ def test_single_byte_bytestrings_with_none(): assert b"b" in values assert b"c" in values assert b"d" in values - # Check that b"nan" appears for None values - assert values.count(b"nan") == 2 + # Check that None appears for None values + assert values.count(None) == 2 def test_two_char_strings_with_none(): """Test two-character strings with None values (edge case).""" - # Two character strings - exactly the edge case where "nan" (3 chars) won't fit + # Two character strings - tests that any string length works correctly array = ak.Array([["ab", None], [None, "cd"]]) df = ak.operations.to_dataframe(array) @@ -104,5 +104,5 @@ def test_two_char_strings_with_none(): values = df["values"].values.tolist() assert "ab" in values assert "cd" in values - # "nan" should appear (3 characters) - assert values.count("nan") == 2 + # None should appear for None values + assert values.count(None) == 2 diff --git a/tests/test_3713_to_dataframe_none_vs_nan_string.py b/tests/test_3713_to_dataframe_none_vs_nan_string.py new file mode 100644 index 0000000000..918fd044c6 --- /dev/null +++ b/tests/test_3713_to_dataframe_none_vs_nan_string.py @@ -0,0 +1,74 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE + +from __future__ import annotations + +import pytest + +import awkward as ak + +pandas = pytest.importorskip("pandas") + + +def test_none_vs_nan_string(): + """Test that None and 'nan' string are distinguishable in to_dataframe.""" + # Create array with None and literal "nan" string + array = ak.Array([None, "nan"]) + df = ak.to_dataframe(array) + result = df.to_dict() + + # None should be None, not the string "nan" + assert result["values"][0] is None + assert result["values"][1] == "nan" + # They should be distinguishable + assert result["values"][0] != result["values"][1] + + +def test_none_vs_nan_bytestring(): + """Test that None and b'nan' bytestring are distinguishable in to_dataframe.""" + # Create array with None and literal b"nan" bytestring + array = ak.Array([None, b"nan"]) + df = ak.to_dataframe(array) + result = df.to_dict() + + # None should be None, not the bytestring b"nan" + assert result["values"][0] is None + assert result["values"][1] == b"nan" + # They should be distinguishable + assert result["values"][0] != result["values"][1] + + +def test_nested_list_with_none_and_nan_string(): + """Test nested lists containing both None and 'nan' string.""" + array = ak.Array([["a", None, "nan"], ["b", "nan", None]]) + df = ak.to_dataframe(array) + + # Check that None and "nan" are distinguishable + values = df["values"].values + assert values[0] == "a" + assert values[1] is None + assert values[2] == "nan" + assert values[3] == "b" + assert values[4] == "nan" + assert values[5] is None + + +def test_record_with_none_and_nan_string(): + """Test records containing both None and 'nan' string.""" + array = ak.Array( + [ + {"x": "value", "y": None}, + {"x": "nan", "y": "another"}, + {"x": None, "y": "nan"}, + ] + ) + df = ak.to_dataframe(array) + + # Check x column + assert df["x"].values[0] == "value" + assert df["x"].values[1] == "nan" + assert df["x"].values[2] is None + + # Check y column + assert df["y"].values[0] is None + assert df["y"].values[1] == "another" + assert df["y"].values[2] == "nan"