Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@
`weighted_median_breakdown_point` now explicitly normalize DataFrame inputs
to their first column before computation, matching validation behavior and
returning scalar/Series outputs consistently.
- **Robust duplicate model-matrix column renaming**
- `_make_df_column_names_unique()` now avoids suffix collisions when a repeated
name (e.g. `a`) appears alongside an existing suffixed name (e.g. `a_1`): columns
`a, a_1, a, a_2` are renamed to `a, a_1, a_2, a_2_1`.
- Duplicate columns are now renamed deterministically to guaranteed-unique
names, preventing downstream clashes after formula sanitization.

## Tests

Expand Down
18 changes: 12 additions & 6 deletions balance/utils/pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import copy
import logging
import warnings
from typing import Any, Dict, NamedTuple
from typing import Any, Dict, NamedTuple, Set

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -578,20 +578,26 @@ def _make_df_column_names_unique(df: pd.DataFrame) -> pd.DataFrame:
A suffix will be added to them but their order might change from one iteration to another.
To avoid issues, make sure to change your original column names to be unique (and without special characters)."""
)
col_counts = {}
col_counts: Dict[Any, int] = {}
used_names: Set[Any] = set()
new_columns = []

for col in df.columns:
if col in col_counts:
col_counts[col] += 1
new_col_name = f"{col}_{col_counts[col]}"
if col in used_names:
next_suffix = col_counts.get(col, 0) + 1
new_col_name = f"{col}_{next_suffix}"
while new_col_name in used_names:
next_suffix += 1
new_col_name = f"{col}_{next_suffix}"
col_counts[col] = next_suffix
logger.warning(
f"Column {col} already exists in the DataFrame, renaming it to be {new_col_name}"
)
else:
col_counts[col] = 0
col_counts.setdefault(col, 0)
new_col_name = col
new_columns.append(new_col_name)
used_names.add(new_col_name)

df.columns = new_columns

Expand Down
40 changes: 38 additions & 2 deletions tests/test_util_pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,6 @@ def test__make_df_column_names_unique(self) -> None:
df1 = pd.DataFrame(data)
df1.columns = ["A", "B", "A", "A"]

# TODO: understand in the future why the names here appear to be consistent while when using the function in
# `model_matrix` it does not appear to work.
self.assertEqual(
_make_df_column_names_unique(df1).to_dict(),
{
Expand All @@ -216,6 +214,34 @@ def test__make_df_column_names_unique(self) -> None:
},
)

def test__make_df_column_names_unique_existing_suffixes(self) -> None:
    """Check that renamed duplicates never clash with already-suffixed labels.

    The second ``a`` cannot take ``a_1`` (already present), so it is expected
    to become ``a_2``; the original ``a_2`` column is then expected to be
    renamed to ``a_2_1``.
    """
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "a_1", "a", "a_2"])
    expected_labels = ["a", "a_1", "a_2", "a_2_1"]

    renamed = _make_df_column_names_unique(frame)

    self.assertEqual(renamed.columns.tolist(), expected_labels)

def test__make_df_column_names_unique_non_string_columns(self) -> None:
    """Check duplicate renaming when some column labels are not strings.

    The integer label ``1`` is repeated and sits next to the string labels
    ``"1_1"`` and ``"1_2"``; the first occurrence is expected to stay the
    integer ``1`` while the renamed columns are expected to be strings.
    """
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=[1, "1_1", 1, "1_2"])
    expected_labels = [1, "1_1", "1_2", "1_2_1"]

    renamed = _make_df_column_names_unique(frame)

    self.assertEqual(renamed.columns.tolist(), expected_labels)

def test__make_df_column_names_unique_when_already_unique(self) -> None:
    """Check that a frame without duplicate labels comes back as the same object."""
    original = pd.DataFrame([[1, 2]], columns=["a", "b"])

    returned = _make_df_column_names_unique(original)

    # Identity check, not just equality: the very same DataFrame is expected back.
    self.assertIs(returned, original)
    self.assertEqual(returned.columns.tolist(), ["a", "b"])

def test__safe_replace_and_infer(self) -> None:
"""Test safe replacement and dtype inference to avoid pandas deprecation warnings."""
# Test with Series containing infinities
Expand Down Expand Up @@ -268,6 +294,16 @@ def test__safe_fillna_and_infer(self) -> None:
expected = pd.DataFrame({"a": [1.0, -1.0, 2.0], "b": [-1.0, 3.0, 4.0]})
pd.testing.assert_frame_equal(result, expected)

# Test with DataFrame containing object columns to cover object-cast branch
df_with_obj = pd.DataFrame({"a": ["x", None], "b": [1, None]}, dtype=object)
result = _safe_fillna_and_infer(df_with_obj, value="missing")
expected = pd.DataFrame(
{"a": ["x", "missing"], "b": [1, "missing"]}, dtype=object
)
pd.testing.assert_frame_equal(result, expected)
self.assertEqual(result["a"].dtype, object)
self.assertEqual(result["b"].dtype, object)

# Test with string replacement
series_str = pd.Series(["a", None, "c"])
result = _safe_fillna_and_infer(series_str, value="_NA")
Expand Down
Loading