Skip to content

Commit bd1c5bc

Browse files
neuralsorcerer authored and meta-codesync[bot] committed
Handle duplicate-column suffix collisions in pandas utils (#347)
Summary: Pull Request resolved: #347. Differential Revision: D94329024. Pulled By: talgalili. fbshipit-source-id: ed89d92201fa24ecf0abb66c50531cee31c4d423
1 parent bb86743 commit bd1c5bc

File tree

3 files changed

+55
-8
lines changed

3 files changed

+55
-8
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
`weighted_median_breakdown_point` now explicitly normalize DataFrame inputs
4343
to their first column before computation, matching validation behavior and
4444
returning scalar/Series outputs consistently.
45+
- **Robust duplicate model-matrix column renaming**
46+
- `_make_df_column_names_unique()` now avoids suffix collisions when columns
47+
like `a`, `a_1`, and repeated `a` names appear together.
48+
- Duplicate columns are now renamed deterministically to guaranteed-unique
49+
names, preventing downstream clashes after formula sanitization.
4550

4651
## Tests
4752

balance/utils/pandas_utils.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import copy
1111
import logging
1212
import warnings
13-
from typing import Any, Dict, NamedTuple
13+
from typing import Any, Dict, NamedTuple, Set
1414

1515
import numpy as np
1616
import pandas as pd
@@ -578,20 +578,26 @@ def _make_df_column_names_unique(df: pd.DataFrame) -> pd.DataFrame:
578578
A suffix will be added to them but their order might change from one iteration to another.
579579
To avoid issues, make sure to change your original column names to be unique (and without special characters)."""
580580
)
581-
col_counts = {}
581+
col_counts: Dict[Any, int] = {}
582+
used_names: Set[Any] = set()
582583
new_columns = []
583584

584585
for col in df.columns:
585-
if col in col_counts:
586-
col_counts[col] += 1
587-
new_col_name = f"{col}_{col_counts[col]}"
586+
if col in used_names:
587+
next_suffix = col_counts.get(col, 0) + 1
588+
new_col_name = f"{col}_{next_suffix}"
589+
while new_col_name in used_names:
590+
next_suffix += 1
591+
new_col_name = f"{col}_{next_suffix}"
592+
col_counts[col] = next_suffix
588593
logger.warning(
589594
f"Column {col} already exists in the DataFrame, renaming it to be {new_col_name}"
590595
)
591596
else:
592-
col_counts[col] = 0
597+
col_counts.setdefault(col, 0)
593598
new_col_name = col
594599
new_columns.append(new_col_name)
600+
used_names.add(new_col_name)
595601

596602
df.columns = new_columns
597603

tests/test_util_pandas_utils.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,6 @@ def test__make_df_column_names_unique(self) -> None:
204204
df1 = pd.DataFrame(data)
205205
df1.columns = ["A", "B", "A", "A"]
206206

207-
# TODO: understand in the future why the names here appear to be consistent while when using the function in
208-
# `model_matrix` it does not appear to work.
209207
self.assertEqual(
210208
_make_df_column_names_unique(df1).to_dict(),
211209
{
@@ -216,6 +214,34 @@ def test__make_df_column_names_unique(self) -> None:
216214
},
217215
)
218216

217+
def test__make_df_column_names_unique_existing_suffixes(self) -> None:
218+
"""Ensure duplicate renaming does not collide with existing suffixed names."""
219+
220+
df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "a_1", "a", "a_2"])
221+
222+
result = _make_df_column_names_unique(df)
223+
224+
self.assertEqual(result.columns.tolist(), ["a", "a_1", "a_2", "a_2_1"])
225+
226+
def test__make_df_column_names_unique_non_string_columns(self) -> None:
227+
"""Ensure duplicate renaming works for non-string column labels."""
228+
229+
df = pd.DataFrame([[1, 2, 3, 4]], columns=[1, "1_1", 1, "1_2"])
230+
231+
result = _make_df_column_names_unique(df)
232+
233+
self.assertEqual(result.columns.tolist(), [1, "1_1", "1_2", "1_2_1"])
234+
235+
def test__make_df_column_names_unique_when_already_unique(self) -> None:
236+
"""Return the input DataFrame unchanged when columns are already unique."""
237+
238+
df = pd.DataFrame([[1, 2]], columns=["a", "b"])
239+
240+
result = _make_df_column_names_unique(df)
241+
242+
self.assertIs(result, df)
243+
self.assertEqual(result.columns.tolist(), ["a", "b"])
244+
219245
def test__safe_replace_and_infer(self) -> None:
220246
"""Test safe replacement and dtype inference to avoid pandas deprecation warnings."""
221247
# Test with Series containing infinities
@@ -268,6 +294,16 @@ def test__safe_fillna_and_infer(self) -> None:
268294
expected = pd.DataFrame({"a": [1.0, -1.0, 2.0], "b": [-1.0, 3.0, 4.0]})
269295
pd.testing.assert_frame_equal(result, expected)
270296

297+
# Test with DataFrame containing object columns to cover object-cast branch
298+
df_with_obj = pd.DataFrame({"a": ["x", None], "b": [1, None]}, dtype=object)
299+
result = _safe_fillna_and_infer(df_with_obj, value="missing")
300+
expected = pd.DataFrame(
301+
{"a": ["x", "missing"], "b": [1, "missing"]}, dtype=object
302+
)
303+
pd.testing.assert_frame_equal(result, expected)
304+
self.assertEqual(result["a"].dtype, object)
305+
self.assertEqual(result["b"].dtype, object)
306+
271307
# Test with string replacement
272308
series_str = pd.Series(["a", None, "c"])
273309
result = _safe_fillna_and_infer(series_str, value="_NA")

0 commit comments

Comments
 (0)