Handle duplicate-column suffix collisions in pandas utils (#347)

neuralsorcerer · meta-codesync[bot] · commit bd1c5bc5985e · 2026-02-25T01:14:48.000-08:00
Summary: Pull Request resolved: #347 · Differential Revision: D94329024 · Pulled By: talgalili · fbshipit-source-id: ed89d92201fa24ecf0abb66c50531cee31c4d423
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -42,6 +42,11 @@
     `weighted_median_breakdown_point` now explicitly normalize DataFrame inputs
     to their first column before computation, matching validation behavior and
     returning scalar/Series outputs consistently.
+- **Robust duplicate model-matrix column renaming**
+  - `_make_df_column_names_unique()` now avoids suffix collisions when columns
+    like `a`, `a_1`, and repeated `a` names appear together.
+  - Duplicate columns are now renamed deterministically to guaranteed-unique
+    names, preventing downstream clashes after formula sanitization.
 
 ## Tests
 
diff --git a/balance/utils/pandas_utils.py b/balance/utils/pandas_utils.py
@@ -10,7 +10,7 @@
 import copy
 import logging
 import warnings
-from typing import Any, Dict, NamedTuple
+from typing import Any, Dict, NamedTuple, Set
 
 import numpy as np
 import pandas as pd
@@ -578,20 +578,26 @@ def _make_df_column_names_unique(df: pd.DataFrame) -> pd.DataFrame:
                     A suffix will be added to them but their order might change from one iteration to another.
                     To avoid issues, make sure to change your original column names to be unique (and without special characters)."""
     )
-    col_counts = {}
+    col_counts: Dict[Any, int] = {}
+    used_names: Set[Any] = set()
     new_columns = []
 
     for col in df.columns:
-        if col in col_counts:
-            col_counts[col] += 1
-            new_col_name = f"{col}_{col_counts[col]}"
+        if col in used_names:
+            next_suffix = col_counts.get(col, 0) + 1
+            new_col_name = f"{col}_{next_suffix}"
+            while new_col_name in used_names:
+                next_suffix += 1
+                new_col_name = f"{col}_{next_suffix}"
+            col_counts[col] = next_suffix
             logger.warning(
                 f"Column {col} already exists in the DataFrame, renaming it to be {new_col_name}"
             )
         else:
-            col_counts[col] = 0
+            col_counts.setdefault(col, 0)
             new_col_name = col
         new_columns.append(new_col_name)
+        used_names.add(new_col_name)
 
     df.columns = new_columns
 
diff --git a/tests/test_util_pandas_utils.py b/tests/test_util_pandas_utils.py
@@ -204,8 +204,6 @@ def test__make_df_column_names_unique(self) -> None:
         df1 = pd.DataFrame(data)
         df1.columns = ["A", "B", "A", "A"]
 
-        # TODO: understand in the future why the names here appear to be consistent while when using the function in
-        # `model_matrix` it does not appear to work.
         self.assertEqual(
             _make_df_column_names_unique(df1).to_dict(),
             {
@@ -216,6 +214,34 @@ def test__make_df_column_names_unique(self) -> None:
             },
         )
 
+    def test__make_df_column_names_unique_existing_suffixes(self) -> None:
+        """Ensure duplicate renaming does not collide with existing suffixed names."""
+
+        df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "a_1", "a", "a_2"])
+
+        result = _make_df_column_names_unique(df)
+
+        self.assertEqual(result.columns.tolist(), ["a", "a_1", "a_2", "a_2_1"])
+
+    def test__make_df_column_names_unique_non_string_columns(self) -> None:
+        """Ensure duplicate renaming works for non-string column labels."""
+
+        df = pd.DataFrame([[1, 2, 3, 4]], columns=[1, "1_1", 1, "1_2"])
+
+        result = _make_df_column_names_unique(df)
+
+        self.assertEqual(result.columns.tolist(), [1, "1_1", "1_2", "1_2_1"])
+
+    def test__make_df_column_names_unique_when_already_unique(self) -> None:
+        """Return the input DataFrame unchanged when columns are already unique."""
+
+        df = pd.DataFrame([[1, 2]], columns=["a", "b"])
+
+        result = _make_df_column_names_unique(df)
+
+        self.assertIs(result, df)
+        self.assertEqual(result.columns.tolist(), ["a", "b"])
+
     def test__safe_replace_and_infer(self) -> None:
         """Test safe replacement and dtype inference to avoid pandas deprecation warnings."""
         # Test with Series containing infinities
@@ -268,6 +294,16 @@ def test__safe_fillna_and_infer(self) -> None:
         expected = pd.DataFrame({"a": [1.0, -1.0, 2.0], "b": [-1.0, 3.0, 4.0]})
         pd.testing.assert_frame_equal(result, expected)
 
+        # Test with DataFrame containing object columns to cover object-cast branch
+        df_with_obj = pd.DataFrame({"a": ["x", None], "b": [1, None]}, dtype=object)
+        result = _safe_fillna_and_infer(df_with_obj, value="missing")
+        expected = pd.DataFrame(
+            {"a": ["x", "missing"], "b": [1, "missing"]}, dtype=object
+        )
+        pd.testing.assert_frame_equal(result, expected)
+        self.assertEqual(result["a"].dtype, object)
+        self.assertEqual(result["b"].dtype, object)
+
         # Test with string replacement
         series_str = pd.Series(["a", None, "c"])
         result = _safe_fillna_and_infer(series_str, value="_NA")