From a6e9f6613cb379ab3047908b94038415bbbc47e7 Mon Sep 17 00:00:00 2001 From: aryansri05 Date: Mon, 1 Dec 2025 19:43:20 +0530 Subject: [PATCH 1/4] Fix boolean casting consistency with Pandas (#20746) Signed-off-by: aryansri05 --- python/cudf/cudf/core/column/numerical.py | 16 ++++++++- python/cudf/cudf/tests/test_issue_20746.py | 42 ++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/tests/test_issue_20746.py diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6beeae83005..b9ab1566573 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -576,6 +576,20 @@ def as_numerical_column(self, dtype: DtypeObj) -> NumericalColumn: res = self.nans_to_nulls().cast(dtype=dtype) res._dtype = dtype return res # type: ignore[return-value] + + # --- FIX: Match Pandas behavior when casting Float(with Nulls) -> Bool --- + # Pandas treats NaN as truthy (True) when casting float -> bool. + # In cuDF, Nulls propagate. We must fill Nulls with np.nan so the + # cast treats them as True. + if ( + self.dtype.kind == "f" + and dtype.kind == "b" + and not is_pandas_nullable_extension_dtype(dtype) + and self.has_nulls() + ): + return self.fillna(np.nan).cast(dtype=dtype) # type: ignore[return-value] + # ------------------------------------------------------------------------ + if dtype_to_pylibcudf_type(dtype) == dtype_to_pylibcudf_type( self.dtype ): @@ -1039,4 +1053,4 @@ def _normalize_find_and_replace_input( ) if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column - return normalized_column.astype(input_column_dtype) + return normalized_column.astype(input_column_dtype) \ No newline at end of file diff --git a/python/cudf/cudf/tests/test_issue_20746.py b/python/cudf/cudf/tests/test_issue_20746.py new file mode 100644 index 00000000000..3d1411810f6 --- /dev/null +++ b/python/cudf/cudf/tests/test_issue_20746.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest +import numpy as np +import pandas as pd +import cudf +from cudf.testing import assert_series_equal + +def test_cast_float_nan_to_bool_pandas_compat(): + """ + Regression test for Issue #20746. + Ensures that casting float columns with NaNs to boolean + treats NaNs as True (matching Pandas behavior) when + mode.pandas_compatible is enabled. + """ + # Enable pandas compatibility mode + cudf.set_option("mode.pandas_compatible", True) + + try: + data = [1.0, 0.0, np.nan, None] + + # Create cuDF Series + gs = cudf.Series(data, dtype="float64") + + # Cast to bool + got = gs.astype("bool") + + # Create expected Pandas Series (Pandas casts NaN/None to True) + expected = pd.Series([True, False, True, True], dtype="bool") + + # Verify + # In Pandas compat mode, we expect NO nulls in the boolean result + assert got.null_count == 0 + + # Convert to pandas for easy comparison or use testing utils + expected_cudf = cudf.Series(expected) + + assert_series_equal(got, expected_cudf) + + finally: + # Reset option to avoid side effects on other tests + cudf.set_option("mode.pandas_compatible", False) From 775ddc7e44cf3dda4475f6ad056c7d98bdd73768 Mon Sep 17 00:00:00 2001 From: aryansri05 Date: Mon, 1 Dec 2025 20:13:17 +0530 Subject: [PATCH 2/4] Add license headers and fix formatting Signed-off-by: aryansri05 --- python/cudf/cudf/tests/test_issue_20746.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_issue_20746.py b/python/cudf/cudf/tests/test_issue_20746.py index 3d1411810f6..03f20fd502e 100644 --- a/python/cudf/cudf/tests/test_issue_20746.py +++ b/python/cudf/cudf/tests/test_issue_20746.py @@ -1,4 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 import pytest import numpy as np From ead53e14a0db5f8cb44bff0d5bbc47b9b1834933 Mon Sep 17 00:00:00 2001 From: aryansri05 Date: Mon, 1 Dec 2025 20:28:36 +0530 Subject: [PATCH 3/4] Fix ruff formatting errors Signed-off-by: aryansri05 --- python/cudf/cudf/core/column/numerical.py | 10 +++++----- python/cudf/cudf/tests/test_issue_20746.py | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b9ab1566573..bde8399c553 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -490,7 +490,7 @@ def as_string_column(self, dtype: DtypeObj) -> StringColumn: ) if len(self) == 0: return cast( - cudf.core.column.StringColumn, + "cudf.core.column.StringColumn", column_empty(0, dtype=CUDF_STRING_DTYPE), ) @@ -576,7 +576,7 @@ def as_numerical_column(self, dtype: DtypeObj) -> NumericalColumn: res = self.nans_to_nulls().cast(dtype=dtype) res._dtype = dtype return res # type: ignore[return-value] - + # --- FIX: Match Pandas behavior when casting Float(with Nulls) -> Bool --- # Pandas treats NaN as truthy (True) when casting float -> bool. # In cuDF, Nulls propagate. We must fill Nulls with np.nan so the @@ -760,7 +760,7 @@ def find_and_replace( replacement_col = replacement_col.repeat(len(to_replace_col)) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - replaced = cast(Self, self.astype(common_type)) + replaced = cast("Self", self.astype(common_type)) df = cudf.DataFrame._from_data( { "old": to_replace_col.astype(common_type), @@ -924,7 +924,7 @@ def _with_type_metadata( ) -> ColumnBase: if isinstance(dtype, CategoricalDtype): codes_dtype = min_unsigned_type(len(dtype.categories)) - codes = cast(NumericalColumn, self.astype(codes_dtype)) + codes = cast("NumericalColumn", self.astype(codes_dtype)) return CategoricalColumn( plc_column=codes.to_pylibcudf(mode="read"), size=codes.size, @@ -1053,4 +1053,4 @@ def _normalize_find_and_replace_input( ) if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column - return normalized_column.astype(input_column_dtype) \ No newline at end of file + return normalized_column.astype(input_column_dtype) diff --git a/python/cudf/cudf/tests/test_issue_20746.py b/python/cudf/cudf/tests/test_issue_20746.py index 03f20fd502e..cd9df41ee01 100644 --- a/python/cudf/cudf/tests/test_issue_20746.py +++ b/python/cudf/cudf/tests/test_issue_20746.py @@ -1,12 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -import pytest import numpy as np import pandas as pd + import cudf from cudf.testing import assert_series_equal + def test_cast_float_nan_to_bool_pandas_compat(): """ Regression test for Issue #20746. @@ -16,28 +17,28 @@ def test_cast_float_nan_to_bool_pandas_compat(): """ # Enable pandas compatibility mode cudf.set_option("mode.pandas_compatible", True) - + try: data = [1.0, 0.0, np.nan, None] - + # Create cuDF Series gs = cudf.Series(data, dtype="float64") - + # Cast to bool got = gs.astype("bool") - + # Create expected Pandas Series (Pandas casts NaN/None to True) expected = pd.Series([True, False, True, True], dtype="bool") - + # Verify # In Pandas compat mode, we expect NO nulls in the boolean result assert got.null_count == 0 - + # Convert to pandas for easy comparison or use testing utils expected_cudf = cudf.Series(expected) - + assert_series_equal(got, expected_cudf) - + finally: # Reset option to avoid side effects on other tests cudf.set_option("mode.pandas_compatible", False) From daf4f9c24c694f1e8571f658c8b8695ae7def837 Mon Sep 17 00:00:00 2001 From: aryansri05 Date: Wed, 3 Dec 2025 00:26:16 +0530 Subject: [PATCH 4/4] Trigger CI restart