Support Pandas 3 (#302)

neuralsorcerer · meta-codesync[bot] · commit 6d34f0120b85 · 2026-02-03T10:15:23.000-08:00
Summary: Pull Request resolved: #302 - Closes #297 Pull Request resolved: #301 Differential Revision: D92157858 Pulled By: talgalili fbshipit-source-id: ecf3b8cd3df62717755549ba13831bd4ce3dd75a
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,18 +11,22 @@
   - Added paired outcome-weight impact tests (`y*w0` vs `y*w1`) with confidence intervals.
   - Exposed in `BalanceDFOutcomes`, `Sample.diagnostics()`, and the CLI via
     `--weights_impact_on_outcome_method`.
+- **Pandas 3 support**
+  - Updated compatibility and tests for pandas 3.x.
 
 ## Bug Fixes
 
 - **Removed deprecated setup build**
   - Replaced deprecated `setup.py` with `pyproject.toml` build in CI to avoid build failure.
 - **Hardened ID column candidate validation**
   - `guess_id_column()` now ignores duplicate candidate names and validates that candidates are non-empty strings.
+- **Hardened pandas 3 compatibility paths**
+  - Updated string/NA handling and discrete checks for pandas 3 dtypes, and refreshed tests to accept string-backed dtypes.
 
 ## Packaging & Tests
 
-- **Pandas 2.x compatibility and upper bound (<3.0.0)**
-  - Constrained the pandas dependency to `>=2,<3.0.0` to avoid untested pandas 3.x API and dtype changes.
+- **Pandas 3.x compatibility**
+  - Expanded the pandas dependency range to allow pandas 3.x releases.
 
 ## Breaking Changes
 
diff --git a/README.md b/README.md
@@ -69,8 +69,8 @@ REQUIRES = [
     # Numpy and pandas: carefully versioned for binary compatibility
     "numpy>=1.21.0,<2.0; python_version<'3.12'",
     "numpy>=1.24.0; python_version>='3.12'",
-    "pandas>=1.5.0,<2.4.0; python_version<'3.12'",
-    "pandas>=2.0.0; python_version>='3.12'",
+    "pandas>=1.5.0,<4.0.0; python_version<'3.12'",
+    "pandas>=2.0.0,<4.0.0; python_version>='3.12'",
     # Scientific stack
     "scipy>=1.7.0,<1.14.0; python_version<'3.12'",
     "scipy>=1.11.0; python_version>='3.12'",
diff --git a/balance/adjustment.py b/balance/adjustment.py
@@ -211,7 +211,7 @@ def trim_weights(
     original_name = getattr(weights, "name", None)
 
     if isinstance(weights, pd.Series):
-        weights = weights.astype(np.float64, copy=False)
+        weights = weights.astype(np.float64)
     elif isinstance(weights, (np.ndarray, list, tuple)):
         weights = pd.Series(
             np.asarray(weights, dtype=np.float64), dtype=np.float64, name=original_name
diff --git a/balance/balancedf_class.py b/balance/balancedf_class.py
@@ -2372,7 +2372,7 @@ def summary(
             target_clause = f"Response rates (in the target):\n {target_response_rates}"
 
         n_outcomes = self.df.shape[1]
-        list_outcomes = self.df.columns.values
+        list_outcomes = np.array(self.df.columns, dtype=object)
         mean_outcomes_with_ci = mean_outcomes_with_ci
         relative_response_rates = relative_response_rates
         target_clause = target_clause
@@ -2458,6 +2458,18 @@ def __init__(self: "BalanceDFWeights", sample: Sample) -> None:
         """
         super().__init__(sample.weight_column.to_frame(), sample, name="weights")
 
+    @property
+    def df(self: "BalanceDFWeights") -> pd.DataFrame:
+        """Return the current weight column as a DataFrame.
+
+        Args:
+            self (BalanceDFWeights): The BalanceDFWeights instance.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the current weight column.
+        """
+        return self._sample.weight_column.to_frame()
+
     # TODO: maybe add better control if there are no weights for unadjusted or target (the current default shows them in the legend, but not in the figure)
     def plot(
         self: "BalanceDFWeights", on_linked_samples: bool = True, **kwargs: Any
diff --git a/balance/sample_class.py b/balance/sample_class.py
@@ -11,6 +11,7 @@
 import inspect
 import logging
 from copy import deepcopy
+from importlib.metadata import version as importlib_version
 from typing import Any, Callable, Dict, List, Literal
 
 import numpy as np
@@ -499,16 +500,23 @@ def from_frame(
             #           for x in df.columns:
             #               if (is_numeric_dtype(df[x])) and (not is_bool_dtype(df[x])):
             #                   df[x] = df[x].astype("float64")
-            input_type = ["Int64", "Int32", "int64", "int32", "int16", "int8", "string"]
+            input_type = ["Int64", "Int32", "int64", "int32", "int16", "int8"]
             output_type = [
                 "float64",
                 "float32",  # This changes Int32Dtype() into dtype('int32') (from pandas to numpy)
                 "float64",
                 "float32",
                 "float16",
                 "float16",  # Using float16 since float8 doesn't exist, see: https://stackoverflow.com/a/40507235/256662
-                "object",
             ]
+            # TODO(after 2026): verify that on pandas >= 3, leaving string dtype unconverted doesn't cause issues for users importing data from SQL
+            # In pandas < 3, convert string dtype to object for compatibility
+            _pd_version = tuple(
+                int(x) for x in importlib_version("pandas").split(".")[:2]
+            )
+            if _pd_version < (3, 0):
+                input_type.append("string")
+                output_type.append("object")
             for i_input, i_output in zip(input_type, output_type):
                 sample._df = balance_util._pd_convert_all_types(
                     sample._df, i_input, i_output
@@ -1122,7 +1130,8 @@ def set_weights(self, weights: pd.Series | float | None) -> None:
                 ].astype("float64")
 
             # Now assign the weights
-            self._df.loc[:, self.weight_column.name] = weights
+            weights_value = np.nan if weights is None else weights
+            self._df.loc[:, self.weight_column.name] = weights_value
 
         self.weight_column = self._df[self.weight_column.name]
 
diff --git a/balance/stats_and_plots/weighted_stats.py b/balance/stats_and_plots/weighted_stats.py
@@ -87,7 +87,7 @@ def _prepare_weighted_stat_args(
 
     dtypes = v.dtypes if hasattr(v.dtypes, "__iter__") else [v.dtypes]
 
-    if not all(np.issubdtype(x, np.number) for x in dtypes):
+    if not all(pd.api.types.is_numeric_dtype(x) for x in dtypes):
         raise TypeError("all columns must be numeric")
 
     if inf_rm:
diff --git a/balance/testutil.py b/balance/testutil.py
@@ -157,6 +157,10 @@ def assertEqual(
         lazy: bool = kwargs.get("lazy", False)
         if isinstance(first, np.ndarray) or isinstance(second, np.ndarray):
             np.testing.assert_array_equal(first, second, **kwargs)
+        elif isinstance(first, pd.api.extensions.ExtensionArray) or isinstance(
+            second, pd.api.extensions.ExtensionArray
+        ):
+            np.testing.assert_array_equal(np.array(first), np.array(second), **kwargs)
         elif isinstance(first, pd.DataFrame) or isinstance(second, pd.DataFrame):
             _assert_frame_equal_lazy(
                 first,
diff --git a/balance/utils/data_transformation.py b/balance/utils/data_transformation.py
@@ -67,7 +67,7 @@ def add_na_indicator(
             filled_col = (
                 df[c].cat.add_categories(replace_val_obj).fillna(replace_val_obj)
             )
-            df[c] = filled_col.infer_objects(copy=False)
+            df[c] = filled_col.infer_objects()
         elif c in non_numeric_cols:
             df[c] = _safe_fillna_and_infer(df[c], replace_val_obj)
         else:
@@ -319,19 +319,21 @@ def fct_lump(s: pd.Series, prop: float = 0.05) -> pd.Series:
         props = s.value_counts() / s.shape[0]
 
     # Ensure proper dtype inference on the index
-    props.index = props.index.infer_objects(copy=False)
+    props.index = props.index.infer_objects()
 
     small_categories = props[props < prop].index.tolist()
 
     remainder_category_name = "_lumped_other"
     while remainder_category_name in props.index:
         remainder_category_name = remainder_category_name * 2
 
-    # Convert to object dtype
-    s = s.astype("object")
+    # Convert to object dtype unless already string dtype
+    if not pd.api.types.is_string_dtype(s.dtype):
+        s = s.astype("object")
 
     # Replace small categories with the remainder category name
-    s.loc[s.apply(lambda x: x in small_categories)] = remainder_category_name
+    mask = s.isin(small_categories).fillna(False)
+    s.loc[mask] = remainder_category_name
     return s
 
 
@@ -349,12 +351,12 @@ def fct_lump_by(s: pd.Series, by: pd.Series, prop: float = 0.05) -> pd.Series:
         pd.Series: pd.series, we keep the index of s as the index of the result.
     """
     res = copy.deepcopy(s)
-    pd.options.mode.copy_on_write = True
     # pandas groupby doesnt preserve order
     for subgroup in pd.unique(by):
         mask = by == subgroup
         grouped_res = fct_lump(res.loc[mask], prop=prop)
         # Ensure dtype compatibility before assignment
-        res = res.astype("object")
+        if not pd.api.types.is_string_dtype(res.dtype):
+            res = res.astype("object")
         res.loc[mask] = grouped_res
     return res
diff --git a/balance/utils/input_validation.py b/balance/utils/input_validation.py
@@ -199,6 +199,7 @@ def _is_discrete_series(series: pd.Series) -> bool:
     return (
         is_binary_indicator
         or pd.api.types.is_object_dtype(series)
+        or pd.api.types.is_string_dtype(series)
         or isinstance(series.dtype, pd.CategoricalDtype)
         or pd.api.types.is_bool_dtype(series)
     )
@@ -351,6 +352,7 @@ def _is_arraylike(o: Any) -> bool:
     return (
         isinstance(o, np.ndarray)
         or isinstance(o, pd.Series)
+        or isinstance(o, pd.api.extensions.ExtensionArray)
         or (
             hasattr(pd.arrays, "NumpyExtensionArray")
             and isinstance(o, pd.arrays.NumpyExtensionArray)
@@ -400,7 +402,9 @@ def _return_type_creation_function(x: Any) -> Callable | Any:
         if isinstance(x, np.ndarray):
             return lambda obj: np.array(obj, dtype=x.dtype)
         # same with pd.arrays.PandasArray, pd.arrays.StringArray, etc.
-        elif "pandas.core.arrays" in str(type(x)):
+        elif isinstance(x, pd.api.extensions.ExtensionArray) or (
+            "pandas.core.arrays" in str(type(x))
+        ):
             return lambda obj: pd.array(obj, dtype=x.dtype)
         else:
             return type(x)
diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
@@ -378,7 +378,7 @@ def _prepare_input_model_matrix(
     if fix_columns_names:
         all_data.columns = all_data.columns.str.replace(
             r"[^\w]", "_", regex=True
-        ).infer_objects(copy=False)
+        ).infer_objects()
         all_data = _make_df_column_names_unique(all_data)
 
     return {"all_data": all_data, "sample_n": sample_n}
diff --git a/balance/utils/pandas_utils.py b/balance/utils/pandas_utils.py
@@ -249,13 +249,24 @@ def _safe_replace_and_infer(
         to_replace = [np.inf, -np.inf]
     if value is None:
         value = np.nan
+    original_dtypes = data.dtypes if isinstance(data, pd.DataFrame) else data.dtype
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
             message="Downcasting behavior in `replace` is deprecated.*",
             category=FutureWarning,
         )
-        return data.replace(to_replace, value).infer_objects(copy=False)
+        result = data.replace(to_replace, value).infer_objects()
+
+    if isinstance(result, pd.Series):
+        if original_dtypes == "object" and result.dtype != "object":
+            result = result.astype("object")
+        return result
+
+    object_cols = [col for col, dtype in original_dtypes.items() if dtype == "object"]
+    if object_cols:
+        result = result.astype(dict.fromkeys(object_cols, "object"))
+    return result
 
 
 def _safe_fillna_and_infer(
@@ -275,12 +286,23 @@ def _safe_fillna_and_infer(
     if value is None:
         value = np.nan
 
+    original_dtypes = data.dtypes if isinstance(data, pd.DataFrame) else data.dtype
     # Suppress pandas FutureWarnings about downcasting during fillna operations
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", FutureWarning)
         filled_data = data.fillna(value)
 
-    return filled_data.infer_objects(copy=False)
+    result = filled_data.infer_objects()
+
+    if isinstance(result, pd.Series):
+        if original_dtypes == "object" and result.dtype != "object":
+            result = result.astype("object")
+        return result
+
+    object_cols = [col for col, dtype in original_dtypes.items() if dtype == "object"]
+    if object_cols:
+        result = result.astype(dict.fromkeys(object_cols, "object"))
+    return result
 
 
 def _safe_groupby_apply(
diff --git a/pyproject.toml b/pyproject.toml
@@ -35,8 +35,8 @@ classifiers = [
 dependencies = [
     "numpy>=1.21.0,<2.0; python_version<'3.12'",
     "numpy>=1.24.0; python_version>='3.12'",
-    "pandas>=1.5.0,<2.4.0; python_version<'3.12'",
-    "pandas>=2.0.0,<3.0.0; python_version>='3.12'",
+    "pandas>=1.5.0,<4.0.0; python_version<'3.12'",
+    "pandas>=2.0.0,<4.0.0; python_version>='3.12'",
     "scipy>=1.7.0,<1.14.0; python_version<'3.12'",
     "scipy>=1.11.0; python_version>='3.12'",
     "scikit-learn>=1.0.0,<1.4.0; python_version<'3.12'",
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -124,7 +124,10 @@ def test_load_sim_data_structure_and_types(self) -> None:
 
         # Check column types for both dataframes
         for df in [target_df, sample_df]:
-            self.assertEqual(df["id"].dtype, object)  # String type
+            self.assertTrue(
+                pd.api.types.is_object_dtype(df["id"].dtype)
+                or pd.api.types.is_string_dtype(df["id"].dtype)
+            )  # String type
             self.assertTrue(
                 df["gender"].dtype == object
                 or pd.api.types.is_string_dtype(df["gender"])
diff --git a/tests/test_sample.py b/tests/test_sample.py
@@ -422,6 +422,32 @@ def test_Sample_from_frame_type_conversion(self) -> None:
         self.assertEqual(Sample.from_frame(df).df.a.dtype.type, np.float16)
         # TODO: add tests for other types of conversions
 
+    def test_Sample_from_frame_string_dtype_conversion_pandas2(self) -> None:
+        """Test string dtype to object conversion for pandas < 3.0.
+
+        Verifies that from_frame correctly converts string dtype columns
+        to object dtype when running on pandas versions < 3.0. This is
+        necessary for compatibility with older pandas versions that don't
+        handle string dtype consistently.
+        """
+        from unittest.mock import patch
+
+        # Create a DataFrame with a string dtype column
+        df = pd.DataFrame(
+            {"id": (1, 2), "a": ("x", "y")},
+        )
+        df["a"] = df["a"].astype("string")
+        self.assertEqual(str(df.a.dtype), "string")
+
+        # Mock importlib.metadata.version to return pandas 2.x
+        with patch(
+            "balance.sample_class.importlib_version",
+            return_value="2.2.0",
+        ):
+            sample = Sample.from_frame(df)
+            # In pandas < 3.0, string dtype should be converted to object
+            self.assertEqual(sample.df.a.dtype, np.object_)
+
     def test_Sample_from_frame_deepcopy_behavior(self) -> None:
         """Test deepcopy parameter behavior.
 
@@ -431,15 +457,24 @@ def test_Sample_from_frame_deepcopy_behavior(self) -> None:
         # Test with use_deepcopy=False - original DataFrame should be modified
         df = pd.DataFrame({"id": (1, 2), "a": (1, 2)})
         self.assertEqual(df.id.dtype.type, np.int64)
-        self.assertEqual(
-            Sample.from_frame(df, use_deepcopy=False).df.id.dtype.type, np.object_
+        dtype_type = Sample.from_frame(df, use_deepcopy=False).df.id.dtype
+        self.assertTrue(
+            pd.api.types.is_object_dtype(dtype_type)
+            or pd.api.types.is_string_dtype(dtype_type)
+        )
+        self.assertTrue(
+            pd.api.types.is_object_dtype(df.id.dtype)
+            or pd.api.types.is_string_dtype(df.id.dtype)
         )
-        self.assertEqual(df.id.dtype.type, np.object_)
 
         # Test with use_deepcopy=True (default) - original DataFrame should be preserved
         df = pd.DataFrame({"id": (1, 2), "a": (1, 2)})
         self.assertEqual(df.id.dtype.type, np.int64)
-        self.assertEqual(Sample.from_frame(df).df.id.dtype.type, np.object_)
+        dtype_type = Sample.from_frame(df).df.id.dtype
+        self.assertTrue(
+            pd.api.types.is_object_dtype(dtype_type)
+            or pd.api.types.is_string_dtype(dtype_type)
+        )
         self.assertEqual(df.id.dtype.type, np.int64)
 
     def test_Sample_adjust(self) -> None:
@@ -1603,16 +1638,9 @@ def get_sample_to_adjust(
         smpl_to_adj = get_sample_to_adjust(df)
         self.assertIsInstance(smpl_to_adj.adjust(method="ipw"), Sample)
 
-        # This should raise a TypeError:
-        with self.assertRaisesRegex(
-            TypeError,
-            "boolean value of NA is ambiguous",
-        ):
-            smpl_to_adj = get_sample_to_adjust(df)
-            # smpl_to_adj._df.iloc[0, 0] = pd.NA
-            smpl_to_adj._df.iloc[0, 1] = pd.NA
-            # This will raise the error:
-            smpl_to_adj.adjust(method="ipw")
+        smpl_to_adj = get_sample_to_adjust(df)
+        smpl_to_adj._df.iloc[0, 1] = pd.NA
+        self.assertIsInstance(smpl_to_adj.adjust(method="ipw"), Sample)
 
         # This works fine
         df.iloc[0, 0] = np.nan
@@ -1626,15 +1654,9 @@ def get_sample_to_adjust(
         smpl_to_adj = get_sample_to_adjust(df)
         self.assertIsInstance(smpl_to_adj.adjust(method="ipw"), Sample)
 
-        # Turning standardize_types to False should raise a TypeError (since we have pd.NA):
-        with self.assertRaisesRegex(
-            TypeError,
-            "boolean value of NA is ambiguous",
-        ):
-            # df.iloc[0, 0] = pd.NA
-            df.iloc[0, 1] = pd.NA
-            smpl_to_adj = get_sample_to_adjust(df, standardize_types=False)
-            smpl_to_adj.adjust(method="ipw")
+        df.iloc[0, 1] = pd.NA
+        smpl_to_adj = get_sample_to_adjust(df, standardize_types=False)
+        self.assertIsInstance(smpl_to_adj.adjust(method="ipw"), Sample)
 
 
 class TestSample_high_cardinality_warnings(balance.testutil.BalanceTestCase):
diff --git a/tests/test_sample_diagnostics_helper.py b/tests/test_sample_diagnostics_helper.py
diff --git a/tests/test_util_data_transformation.py b/tests/test_util_data_transformation.py
diff --git a/tests/test_util_pandas_utils.py b/tests/test_util_pandas_utils.py