Skip to content

Commit db8de97

Browse files
Kacper Kozubowski and j-adamczyk
authored
Add TOPKAT and PROB-STD methods with tests (#480)
Co-authored-by: Kacper Kozubowski <agapanthus133@gmail.com> Co-authored-by: Jakub Adamczyk <jakubadamczyk10@gmail.com>
1 parent ac8ec93 commit db8de97

File tree

7 files changed

+557
-2
lines changed

7 files changed

+557
-2
lines changed

.github/workflows/tests.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ jobs:
3333
name: Cache venv
3434
with:
3535
path: ./.venv
36-
key: ${{ matrix.os }}-venv-${{ hashFiles('**/uv.lock') }}
36+
key: ${{ matrix.os }}-venv-${{ matrix.python-version }}-${{ hashFiles('**/uv.lock') }}
3737

3838
- uses: actions/cache@v4
3939
name: Cache datasets
@@ -42,7 +42,8 @@ jobs:
4242
key: datasets
4343

4444
- name: Install the project dependencies
45-
run: uv sync --group test
45+
run: uv sync --group test --python "$(python -c 'import sys; print(sys.executable)')"
46+
shell: bash
4647

4748
- name: Check pre-commit
4849
run: uv run pre-commit run --all-files

docs/modules/applicability_domain.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,7 @@ Classes for checking applicability domain.
2121
KNNADChecker
2222
LeverageADChecker
2323
PCABoundingBoxADChecker
24+
ProbStdADChecker
2425
ResponseVariableRangeADChecker
2526
StandardDeviationADChecker
27+
TOPKATADChecker

skfp/applicability_domain/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@
55
from .knn import KNNADChecker
66
from .leverage import LeverageADChecker
77
from .pca_bounding_box import PCABoundingBoxADChecker
8+
from .prob_std import ProbStdADChecker
89
from .response_variable_range import ResponseVariableRangeADChecker
910
from .standard_deviation import StandardDeviationADChecker
11+
from .topkat import TOPKATADChecker
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
from numbers import Real
2+
3+
import numpy as np
4+
from scipy.stats import norm
5+
from sklearn.base import is_classifier
6+
from sklearn.ensemble import RandomForestRegressor
7+
from sklearn.utils._param_validation import Interval, InvalidParameterError
8+
from sklearn.utils.validation import check_is_fitted, validate_data
9+
10+
from skfp.bases.base_ad_checker import BaseADChecker
11+
12+
13+
class ProbStdADChecker(BaseADChecker):
    """
    Probabilistic standard deviation method (PROB-STD).

    Defines applicability domain based on the probabilistic interpretation of prediction
    uncertainty from individual estimators in an ensemble model [1]_. For each sample,
    the mean and standard deviation of the ensemble predictions are used to construct
    a normal distribution. The score is defined as the probability mass under this
    distribution that lies on the wrong side of the classification threshold (0.5).

    This approach supports both regression models (using ``.predict(X)``) and binary classifiers
    (using ``.predict_proba(X)`` and the probability of the positive class). For regression models,
    the outputs should be interpretable as positive-class probabilities in [0, 1], e.g. when the
    regressor is trained on binary targets. The ensemble model must expose the ``estimators_``
    attribute. If no model is provided, a default :class:`~sklearn.ensemble.RandomForestRegressor`
    is created and trained during :meth:`fit`.

    At prediction time, each sample is passed to all estimators, and their predictions
    (or predicted probabilities for classifiers) are used to construct the distribution.
    The sample is considered in-domain if the resulting probability of misclassification
    (PROB-STD) is lower than or equal to the specified threshold.

    Parameters
    ----------
    model : object, default=None
        Fitted ensemble model with accessible ``estimators_`` attribute and
        either ``.predict(X)`` or ``.predict_proba(X)`` method on each sub-estimator.
        If not provided, a default :class:`~sklearn.ensemble.RandomForestRegressor` will
        be created. Note that if you pass a fitted model here, call to :meth:`fit` is
        not necessary, but it will perform model validation.

    threshold : float, default=0.1
        Maximum allowed probability of incorrect class assignment.
        Lower values yield a stricter applicability domain.

    n_jobs : int, default=None
        The number of jobs to run in parallel. :meth:`transform_x_y` and
        :meth:`transform` are parallelized over the input molecules. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See scikit-learn documentation on ``n_jobs`` for more details.

    verbose : int or dict, default=0
        Controls the verbosity when filtering molecules.
        If a dictionary is passed, it is treated as kwargs for ``tqdm()``,
        and can be used to control the progress bar.

    References
    ----------
    .. [1] `Klingspohn, W., Mathea, M., ter Laak, A. et al.
        "Efficiency of different measures for defining the applicability
        domain of classification models."
        Journal of Cheminformatics 9, 44 (2017).
        <https://doi.org/10.1186/s13321-017-0230-2>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from skfp.applicability_domain import ProbStdADChecker
    >>> X_train = np.random.uniform(0, 1, size=(1000, 5))
    >>> y_train = (X_train[:, 0] + X_train[:, 1] > 1).astype(float)
    >>> model = RandomForestRegressor(n_estimators=10, random_state=0)
    >>> model.fit(X_train, y_train)
    RandomForestRegressor(n_estimators=10, random_state=0)
    >>> probstd_ad_checker = ProbStdADChecker(model=model, threshold=0.1)
    >>> probstd_ad_checker.fit()
    ProbStdADChecker(model=RandomForestRegressor(...), threshold=0.1)
    >>> probstd_ad_checker
    ProbStdADChecker(model=RandomForestRegressor(...), threshold=0.1)

    >>> X_test = np.random.uniform(0, 1, size=(100, 5))
    >>> probstd_ad_checker.predict(X_test).shape
    (100,)
    """

    # Extend base constraints; threshold must lie in [0, 0.5) since a
    # misclassification probability of 0.5 means a coin-flip prediction.
    _parameter_constraints: dict = {
        **BaseADChecker._parameter_constraints,
        "model": [object, None],
        "threshold": [Interval(Real, 0, 0.5, closed="left")],
    }

    def __init__(
        self,
        model: object | None = None,
        threshold: float = 0.1,
        n_jobs: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_jobs=n_jobs,
            verbose=verbose,
        )
        # Stored unmodified per scikit-learn convention; the working copy is
        # ``model_`` (set in fit / lazily in _compute_prob_std).
        self.model = model
        self.threshold = threshold

    def _validate_params(self):
        """Validate constructor parameters and, for classifiers, the fitted model."""
        super()._validate_params()

        # Regressors are accepted as-is; only user-supplied classifiers need
        # extra checks (fitted, probabilistic, binary).
        if self.model is not None and is_classifier(self.model):
            check_is_fitted(self.model, "classes_")

            if not hasattr(self.model, "predict_proba"):
                raise InvalidParameterError(
                    f"{self.__class__.__name__} requires classifiers with .predict_proba() method"
                )

            if len(getattr(self.model, "classes_", [])) != 2:
                raise InvalidParameterError(
                    f"{self.__class__.__name__} only supports binary classifiers"
                )

    def fit(  # noqa: D102
        self,
        X: np.ndarray | None = None,
        y: np.ndarray | None = None,
    ):
        self._validate_params()

        if self.model is None:
            # No model supplied: train a default random forest on (X, y).
            X, y = validate_data(self, X, y, ensure_2d=False)
            self.model_ = RandomForestRegressor(random_state=0)
            self.model_.fit(X, y)  # type: ignore[union-attr]
        else:
            # User-supplied (already fitted) model: fit only aliases it.
            self.model_ = self.model

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: D102
        # In-domain when the misclassification probability is small enough.
        prob_std = self._compute_prob_std(X)
        return prob_std <= self.threshold

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the applicability domain score of samples.
        It is defined as the minimum probabilistic mass under the normal distribution
        that lies on either side of the classification threshold (0.5).
        Lower values indicate higher confidence in class assignment.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Probabilistic scores reflecting the uncertainty of class assignment.
        """
        return self._compute_prob_std(X)

    def _compute_prob_std(self, X: np.ndarray) -> np.ndarray:
        """Compute the PROB-STD score (misclassification probability) per sample."""
        X = validate_data(self, X=X, reset=False)
        # Allow prediction without an explicit fit() call when a fitted model
        # was passed to the constructor.
        if self.model is not None and not hasattr(self, "model_"):
            self.model_ = self.model
        check_is_fitted(self.model_, "estimators_")
        self._validate_params()

        if is_classifier(self.model_):
            # Use per-estimator positive-class probabilities.
            preds = np.array([est.predict_proba(X) for est in self.model_.estimators_])
            preds = preds[:, :, 1]  # shape: (n_estimators, n_samples)
        else:
            preds = np.array([est.predict(X) for est in self.model_.estimators_])

        preds = preds.T  # shape: (n_samples, n_estimators)

        y_mean = preds.mean(axis=1)
        y_std = preds.std(axis=1)
        # Floor the std to avoid a degenerate (zero-scale) normal distribution
        # when all estimators agree exactly.
        y_std = np.maximum(y_std, 1e-8)

        # Probability mass on the "wrong" side of the 0.5 decision boundary:
        # the smaller tail of N(y_mean, y_std) around 0.5.
        left_tail = norm.cdf(0.5, loc=y_mean, scale=y_std)
        prob_std = np.minimum(left_tail, 1 - left_tail)
        return prob_std
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
from numbers import Real
2+
3+
import numpy as np
4+
from sklearn.utils._param_validation import Interval
5+
from sklearn.utils.validation import validate_data
6+
7+
from skfp.bases.base_ad_checker import BaseADChecker
8+
9+
10+
class TOPKATADChecker(BaseADChecker):
    """
    TOPKAT (Optimal Prediction Space) method.

    Defines applicability domain using the Optimal Prediction Space (OPS) approach
    introduced in the TOPKAT system [1]_. The method transforms the input feature space
    into a normalized, centered and rotated space using PCA on a scaled [-1, 1] version
    of the training data (S-space). Each new sample is projected into the same OPS space,
    and a weighted distance (dOPS) from the center is computed.

    Samples are considered in-domain if their dOPS is below a threshold. By default,
    this threshold is computed as :math:`5 * D / (2 * N)`, where:

    - :math:`D` is the number of input features,
    - :math:`N` is the number of training samples.

    This method captures both the variance and correlation structure of the descriptors,
    and performs well for detecting global outliers in descriptor space.

    Parameters
    ----------
    threshold : float, default=None
        Optional user-defined threshold for dOPS. If provided, overrides the default
        analytical threshold :math:`5 * D / (2 * N)`. Lower values produce stricter domains.

    n_jobs : int, default=None
        The number of jobs to run in parallel. :meth:`transform_x_y` and
        :meth:`transform` are parallelized over the input molecules. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See scikit-learn documentation on ``n_jobs`` for more details.

    verbose : int or dict, default=0
        Controls the verbosity when filtering molecules.
        If a dictionary is passed, it is treated as kwargs for ``tqdm()``,
        and can be used to control the progress bar.

    References
    ----------
    .. [1] Gombar, V. K.
        `"Method and apparatus for validation of model-based predictions."
        U.S. Patent No. 6,036,349. Washington, DC: U.S. Patent and Trademark Office.
        <https://patents.google.com/patent/US6036349A/en>`_

    Examples
    --------
    >>> import numpy as np
    >>> from skfp.applicability_domain import TOPKATADChecker
    >>> from sklearn.datasets import make_blobs
    >>> X_train, _ = make_blobs(n_samples=100, centers=2, n_features=5, random_state=0)
    >>> X_test = X_train[:5]

    >>> topkat_ad_checker = TOPKATADChecker()
    >>> topkat_ad_checker.fit(X_train)
    TOPKATADChecker()

    >>> topkat_ad_checker.predict(X_test)
    array([ True,  True,  True,  True,  True])
    """

    # threshold may be any non-negative real, or None to use the analytical default.
    _parameter_constraints: dict = {
        **BaseADChecker._parameter_constraints,
        "threshold": [Interval(Real, 0, None, closed="left"), None],
    }

    def __init__(
        self,
        threshold: float | None = None,
        n_jobs: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_jobs=n_jobs,
            verbose=verbose,
        )
        self.threshold = threshold

    def fit(  # noqa: D102
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,  # noqa: ARG002
    ):
        X = validate_data(self, X=X)

        # Per-feature training ranges, needed to re-apply the same S-space
        # scaling to new samples at prediction time.
        self.X_min_ = X.min(axis=0)
        self.X_max_ = X.max(axis=0)
        self.range_ = self.X_max_ - self.X_min_
        self.num_points = X.shape[0]
        self.num_dims = X.shape[1]

        # TOPKAT S-space: feature-wise scaling of X to [-1, 1].
        # Avoid division by zero: where range==0, denom=1 => scaled value will be 0.
        self.denom_ = np.where((self.range_) != 0, (self.range_), 1.0)
        S = (2 * X - self.X_max_ - self.X_min_) / self.denom_

        # Augment with bias (1) so the subsequent rotation (PCA) includes the intercept term.
        S_bias = np.c_[np.ones(S.shape[0]), S]

        # Eigendecomposition of the (symmetric) cross-product matrix defines the
        # rotation into the Optimal Prediction Space; eigh is used since the
        # matrix is symmetric, and np.real guards against residual imaginary parts.
        cov_matrix = S_bias.T @ S_bias
        eigvals, eigvecs = np.linalg.eigh(cov_matrix)
        self.eigen_val = np.real(eigvals)
        self.eigen_vec = np.real(eigvecs)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: D102
        dOPS = self._compute_dops(X)

        # Fall back to the analytical threshold 5 * D / (2 * N) when the user
        # did not supply one.
        threshold = self.threshold
        if threshold is None:
            threshold = (5 * self.num_dims) / (2 * self.num_points)

        return dOPS < threshold

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the applicability domain score of samples.
        It is defined as the weighted distance (dOPS) from
        the center of the training data in the OPS-transformed space.
        Lower values indicate higher similarity to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Distance scores from the center of the Optimal Prediction Space (dOPS).
        """
        return self._compute_dops(X)

    def _compute_dops(self, X: np.ndarray) -> np.ndarray:
        """Project samples into OPS space and return their weighted distances (dOPS)."""
        X = validate_data(self, X=X, reset=False)

        # Apply the same S-space transform as in fit().
        Ssample = (2 * X - self.X_max_ - self.X_min_) / self.denom_

        # Add bias term to match the augmented space used to compute eigenvectors.
        Ssample_bias = np.c_[np.ones(Ssample.shape[0]), Ssample]

        # Project to OPS space
        OPS_sample = Ssample_bias @ self.eigen_vec

        # Inverse eigenvalues weight each OPS coordinate; zero eigenvalues are
        # mapped to weight 0 (pseudo-inverse behavior) to avoid division by zero.
        inv_eigval = np.divide(
            1.0,
            self.eigen_val,
            out=np.zeros_like(self.eigen_val),
            where=self.eigen_val != 0,
        )
        # dOPS: sum of squared OPS coordinates weighted by inverse eigenvalues
        # (a Mahalanobis-like distance from the training-data center).
        dOPS = np.sum((OPS_sample**2) * inv_eigval, axis=1)
        return dOPS

0 commit comments

Comments
 (0)