Skip to content

Commit 810180a

Browse files
authored
Add support for feature_names_in_ (#7877)
This PR adds support for `feature_names_in_`, implemented through the recently added `check_features` validation. This implementation matches sklearn's implementation, helping us achieve better compatibility. In cases where this new validation check would cause previously working code to error, we now raise a `FutureWarning` noting that in 26.06 this will start to error, as well as the future error message. When run under `cuml.accel` we instead error right away, helping us improve sklearn compatibility right away when in `cuml.accel`. These `FutureWarning`s will mainly show up if you fit on a dataframe with columns (say `["a", "b"]`) that don't match the columns of a _dataframe_ provided during inference (say `["x", "y"]`). In the future that will error as the feature names don't match between fit and inference times. Because of this new `FutureWarning` (and soon to be error) I'm marking this PR as "breaking" as per our policy. Note that users fitting on a dataframe and inferring on an array (or vice versa) will see a `UserWarning` in those cases. This matches sklearn behavior. This warning will _not_ turn into an error in a future release, these warnings will remain warnings. Fixes #6650. Fixes #5677. Fixes #6498. Part of #7428 Authors: - Jim Crist-Harif (https://github.com/jcrist) Approvers: - Simon Adorf (https://github.com/csadorf) URL: #7877
1 parent 868c4cf commit 810180a

File tree

16 files changed

+309
-217
lines changed

16 files changed

+309
-217
lines changed

python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
import cuml
3737
from cuml.internals.array_sparse import SparseCumlArray
3838
from cuml.internals.global_settings import _global_settings_data
39-
from cuml.internals.validation import check_is_fitted
39+
from cuml.internals.validation import check_is_fitted, check_features
4040

4141
from ....thirdparty_adapters import check_array
4242
from ..preprocessing._function_transformer import FunctionTransformer
@@ -880,13 +880,6 @@ def fit_transform(self, X, y=None) -> SparseCumlArray:
880880
sparse matrices.
881881
882882
"""
883-
# TODO: this should be `feature_names_in_` when we start having it
884-
if hasattr(X, "columns"):
885-
self._feature_names_in = cpu_np.asarray(X.columns)
886-
else:
887-
self._feature_names_in = None
888-
# set n_features_in_ attribute
889-
self._check_n_features(X, reset=True)
890883
self._validate_transformers()
891884
self._validate_column_callables(X)
892885
self._validate_remainder(X)
@@ -935,19 +928,7 @@ def transform(self, X) -> SparseCumlArray:
935928
936929
"""
937930
check_is_fitted(self)
938-
if hasattr(X, "columns"):
939-
X_feature_names = cpu_np.asarray(X.columns)
940-
else:
941-
X_feature_names = None
942-
943-
self._check_n_features(X, reset=False)
944-
if (self._feature_names_in is not None and
945-
X_feature_names is not None and
946-
cpu_np.any(self._feature_names_in != X_feature_names)):
947-
raise RuntimeError(
948-
"Given feature/column names do not match the ones for the "
949-
"data given during fit."
950-
)
931+
check_features(self, X)
951932
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
952933
self._validate_output(Xs)
953934

python/cuml/cuml/_thirdparty/sklearn/utils/skl_dependencies.py

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515

1616
from cuml.internals.array_sparse import SparseCumlArray
17+
from cuml.internals.validation import check_features
1718

1819
from ....internals.base import Base
1920
from ....thirdparty_adapters import check_array
@@ -41,35 +42,6 @@ def init(self, *args, **kwargs):
4142

4243
cls.__init__ = init
4344

44-
def _check_n_features(self, X, reset):
45-
"""Set the `n_features_in_` attribute, or check against it.
46-
47-
Parameters
48-
----------
49-
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
50-
The input samples.
51-
reset : bool
52-
If True, the `n_features_in_` attribute is set to `X.shape[1]`.
53-
Else, the attribute must already exist and the function checks
54-
that it is equal to `X.shape[1]`.
55-
"""
56-
n_features = X.shape[1]
57-
58-
if reset:
59-
self.n_features_in_ = n_features
60-
else:
61-
if not hasattr(self, 'n_features_in_'):
62-
raise RuntimeError(
63-
"The reset parameter is False but there is no "
64-
"n_features_in_ attribute. Is this estimator fitted?"
65-
)
66-
if n_features != self.n_features_in_:
67-
raise ValueError(
68-
'X has {} features, but {} is expecting {} features '
69-
'as input.'.format(n_features, self.__class__.__name__,
70-
self.n_features_in_)
71-
)
72-
7345
def _validate_data(self, X, y=None, reset=True,
7446
validate_separately=False, **check_params):
7547
"""Validate input data and set or check the `n_features_in_` attribute.
@@ -100,6 +72,12 @@ def _validate_data(self, X, y=None, reset=True,
10072
out : {ndarray, sparse matrix} or tuple of these
10173
The validated input. A tuple is returned if `y` is not None.
10274
"""
75+
if check_params.get('ensure_2d', True) and not reset:
76+
# The `reset=True` case is always handled by the mandatory
77+
# `reflect(reset=True)` decorators currently. To avoid
78+
# duplicate calls, we avoid `check_features(self, X, reset=True)`
79+
# for now.
80+
check_features(self, X)
10381

10482
if y is None:
10583
if self._get_tags()['requires_y']:
@@ -122,9 +100,6 @@ def _validate_data(self, X, y=None, reset=True,
122100
X, y = check_X_y(X, y, **check_params)
123101
out = X, y
124102

125-
if check_params.get('ensure_2d', True):
126-
self._check_n_features(X, reset=reset)
127-
128103
return out
129104

130105

python/cuml/cuml/accel/_overrides/sklearn/preprocessing.py

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -121,46 +121,14 @@ class TargetEncoder(ProxyBase):
121121
_gpu_class = cuml.preprocessing.TargetEncoder
122122

123123
def _gpu_fit(self, X, y, **kwargs):
124-
"""Fit with independent mode for sklearn compatibility.
125-
126-
sklearn's TargetEncoder always encodes features independently,
127-
so we force independent mode when using cuml.accel.
128-
"""
129124
# Check for unsupported inputs (triggers CPU fallback)
130125
_check_unsupported_inputs(X, y, self._cpu)
131-
132-
# Ensure independent mode is set for sklearn compatibility
133-
self._gpu.multi_feature_mode = "independent"
134-
result = self._gpu.fit(X, y, **kwargs)
135-
136-
# Sync sklearn-expected attributes to the proxy
137-
if hasattr(self._gpu, "feature_names_in_"):
138-
self.feature_names_in_ = self._gpu.feature_names_in_
139-
if hasattr(self._gpu, "n_features_in_"):
140-
self.n_features_in_ = self._gpu.n_features_in_
141-
142-
return result
126+
return self._gpu.fit(X, y, **kwargs)
143127

144128
def _gpu_fit_transform(self, X, y, **kwargs):
145-
"""Fit-transform with independent mode for sklearn compatibility.
146-
147-
sklearn's TargetEncoder always encodes features independently,
148-
so we force independent mode when using cuml.accel.
149-
"""
150129
# Check for unsupported inputs (triggers CPU fallback)
151130
_check_unsupported_inputs(X, y, self._cpu)
152-
153-
# Ensure independent mode is set for sklearn compatibility
154-
self._gpu.multi_feature_mode = "independent"
155-
result = self._gpu.fit_transform(X, y, **kwargs)
156-
157-
# Sync sklearn-expected attributes to the proxy
158-
if hasattr(self._gpu, "feature_names_in_"):
159-
self.feature_names_in_ = self._gpu.feature_names_in_
160-
if hasattr(self._gpu, "n_features_in_"):
161-
self.n_features_in_ = self._gpu.n_features_in_
162-
163-
return result
131+
return self._gpu.fit_transform(X, y, **kwargs)
164132

165133
def _gpu_get_feature_names_out(self, input_features=None):
166134
"""Return feature names for output features.

python/cuml/cuml/accel/estimator_proxy.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -422,12 +422,9 @@ def __getattr__(self, name: str) -> Any:
422422
try:
423423
return getattr(self._cpu, name)
424424
except AttributeError:
425-
# We special case `feature_names_in_` here since it's the only common
426-
# fitted attribute that cuml doesn't support anywhere.
427-
if (
428-
name in self._not_implemented_attributes
429-
or name == "feature_names_in_"
430-
) and is_fitted(self._cpu):
425+
if name in self._not_implemented_attributes and is_fitted(
426+
self._cpu
427+
):
431428
raise AttributeError(
432429
f"The `{type(self).__name__}.{name}` attribute is not yet "
433430
"implemented in `cuml.accel`.\n\n"

python/cuml/cuml/internals/interop.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
from __future__ import annotations
44

@@ -189,15 +189,11 @@ def _attrs_to_cpu(self, model) -> dict[str, Any]:
189189
If one or more attributes are unsupported by the CPU model.
190190
"""
191191
out = {}
192-
if (
193-
n_features_in_ := getattr(self, "n_features_in_", None)
194-
) is not None:
195-
out["n_features_in_"] = n_features_in_
196-
197-
# TODO: Some cuml estimators set `feature_names_in_`, but they don't
198-
# do this properly per sklearn conventions. For now we skip forwarding
199-
# feature_names_in_ to CPU. Revisit once
200-
# https://github.com/rapidsai/cuml/issues/6650 is resolved.
192+
for name in ["n_features_in_", "feature_names_in_"]:
193+
try:
194+
out[name] = getattr(self, name)
195+
except AttributeError:
196+
pass
201197
return out
202198

203199
def _sync_attrs_to_cpu(self, model) -> None:

python/cuml/cuml/internals/validation.py

Lines changed: 121 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
import numbers
6+
import warnings
67

8+
import cudf
79
import cupy as cp
810
import numpy as np
11+
import pandas as pd
912
from sklearn.utils.validation import check_is_fitted
1013

1114
__all__ = (
@@ -84,8 +87,63 @@ def _get_n_features(X):
8487
return shape[1] if len(shape) >= 2 else 1
8588

8689

90+
def _warn_or_error(exc_cls, msg):
91+
"""Errors if running in cuml.accel, otherwise warns that an error will be
92+
raised in the future."""
93+
import cuml.accel
94+
95+
if cuml.accel.enabled():
96+
raise exc_cls(msg)
97+
else:
98+
warnings.warn(
99+
"cuml is adding support for `feature_names_in_` for validating "
100+
"the feature names of dataframe-like inputs. In cuml 26.06 this "
101+
f"will error with the following message:\n\n{msg}",
102+
FutureWarning,
103+
)
104+
105+
106+
def _get_feature_names(X):
107+
"""Get feature names from X.
108+
109+
Returns
110+
-------
111+
names: ndarray or None
112+
Feature names of `X`. Unrecognized array containers will return `None`.
113+
"""
114+
if isinstance(X, (pd.DataFrame, cudf.DataFrame)):
115+
feature_names = np.asarray(X.columns, dtype=object)
116+
elif hasattr(X, "__dataframe__"):
117+
feature_names = np.asarray(
118+
list(X.__dataframe__().column_names()), dtype=object
119+
)
120+
else:
121+
return None
122+
123+
if len(feature_names) == 0:
124+
# No features, just return None
125+
return None
126+
127+
# Check the types of the column names.
128+
types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))
129+
if len(types) == 1 and types[0] == "str":
130+
return feature_names
131+
elif len(types) > 1 and "str" in types:
132+
msg = (
133+
"Feature names are only supported if all input features have string names, "
134+
f"but your input has {types} as feature name / column name types. "
135+
"If you want feature names to be stored and validated, you must convert "
136+
"them all to strings, by using X.columns = X.columns.astype(str) for "
137+
"example. Otherwise you can remove feature / column names from your input "
138+
"data, or convert them all to a non-string data type."
139+
)
140+
_warn_or_error(TypeError, msg)
141+
142+
return None
143+
144+
87145
def check_features(estimator, X, reset=False) -> None:
88-
"""Check or set ``n_features_in_``.
146+
"""Check or set ``n_features_in_`` and ``feature_names_in_``.
89147
90148
Parameters
91149
----------
@@ -95,17 +153,71 @@ def check_features(estimator, X, reset=False) -> None:
95153
The original user-provided `X` input. No conversion or processing steps
96154
should have occurred to this array yet.
97155
reset : bool, default=False
98-
If true, ``n_features_in_`` is set on ``estimator`` to match ``X``.
99-
Otherwise the ``X`` is checked to match the existing
100-
``n_features_in_``. ``reset=True`` should be used for fit-like methods.
156+
If True, ``n_features_in_`` and ``feature_names_in_`` are set on
157+
``estimator`` to match ``X``. Otherwise ``X`` is checked to match the
158+
existing ``n_features_in_`` and ``feature_names_in_``. ``reset=True``
159+
should be used for fit-like methods, and False otherwise.
101160
"""
102161
n_features = _get_n_features(X)
162+
feature_names = _get_feature_names(X)
103163

104164
if reset:
105165
estimator.n_features_in_ = n_features
106-
else:
107-
if n_features != estimator.n_features_in_:
108-
raise ValueError(
109-
f"X has {n_features} features, but {estimator.__class__.__name__} "
110-
f"is expecting {estimator.n_features_in_} features as input."
166+
if feature_names is not None:
167+
estimator.feature_names_in_ = feature_names
168+
elif hasattr(estimator, "feature_names_in_"):
169+
# Clear old feature names if present
170+
delattr(estimator, "feature_names_in_")
171+
return
172+
173+
est_feature_names = getattr(estimator, "feature_names_in_", None)
174+
175+
# Check feature_names_in_ first
176+
if est_feature_names is not None or feature_names is not None:
177+
if est_feature_names is None:
178+
warnings.warn(
179+
f"X has feature names, but {estimator.__class__.__name__} was fitted "
180+
"without feature names"
111181
)
182+
183+
elif feature_names is None:
184+
warnings.warn(
185+
"X does not have valid feature names, but"
186+
f" {estimator.__class__.__name__} was fitted with feature names"
187+
)
188+
189+
elif len(est_feature_names) != len(feature_names) or np.any(
190+
est_feature_names != feature_names
191+
):
192+
unexpected = sorted(
193+
set(feature_names).difference(est_feature_names)
194+
)
195+
missing = sorted(set(est_feature_names).difference(feature_names))
196+
197+
parts = [
198+
"The feature names should match those that were passed during fit."
199+
]
200+
for heading, names in [
201+
("Feature names unseen at fit time:", unexpected),
202+
("Feature names seen at fit time, yet now missing:", missing),
203+
]:
204+
if names:
205+
parts.append(heading)
206+
parts.extend([f"- {name}" for name in names[:5]])
207+
if len(names) > 5:
208+
parts.append("- ...")
209+
210+
if not missing and not unexpected:
211+
parts.append(
212+
"Feature names must be in the same order as they were in fit."
213+
)
214+
215+
msg = "\n".join(parts)
216+
_warn_or_error(ValueError, msg)
217+
218+
# Then check n_features_in_
219+
if n_features != estimator.n_features_in_:
220+
raise ValueError(
221+
f"X has {n_features} features, but {estimator.__class__.__name__} "
222+
f"is expecting {estimator.n_features_in_} features as input."
223+
)

python/cuml/cuml/neighbors/kernel_density.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ def _sync_attrs_to_cpu(self, model):
248248
else cp.asnumpy(self._sample_weight)
249249
)
250250
model.fit(X, sample_weight=sample_weight)
251+
if hasattr(self, "feature_names_in_"):
252+
model.feature_names_in_ = self.feature_names_in_
251253

252254
def __init__(
253255
self,

0 commit comments

Comments
 (0)