Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/modules/applicability_domain.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ Classes for checking applicability domain.
HotellingT2TestADChecker
LeverageADChecker
PCABoundingBoxADChecker
ProbStdADChecker
TOPKATADChecker
2 changes: 2 additions & 0 deletions skfp/applicability_domain/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
from .hotelling_t2_test import HotellingT2TestADChecker
from .leverage import LeverageADChecker
from .pca_bounding_box import PCABoundingBoxADChecker
from .prob_std import ProbStdADChecker
from .topkat import TOPKATADChecker
182 changes: 182 additions & 0 deletions skfp/applicability_domain/prob_std.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
from numbers import Real

import numpy as np
from scipy.stats import norm
from sklearn.base import is_classifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils._param_validation import Interval, InvalidParameterError
from sklearn.utils.validation import check_is_fitted, validate_data

from skfp.bases.base_ad_checker import BaseADChecker


class ProbStdADChecker(BaseADChecker):
    """
    Probabilistic standard deviation method (PROB-STD).

    Defines applicability domain based on the probabilistic interpretation of prediction
    uncertainty from individual estimators in an ensemble model [1]_. For each sample,
    the mean and standard deviation of the ensemble predictions are used to construct
    a normal distribution. The score is defined as the probability mass under this
    distribution that lies on the wrong side of the classification threshold (0.5).

    This approach supports both regression models (using ``.predict(X)``) and binary classifiers
    (using ``.predict_proba(X)`` and the probability of the positive class). For regression models,
    the outputs should be interpretable as positive-class probabilities in [0, 1], e.g. when the
    regressor is trained on binary targets. The ensemble model must expose the ``estimators_``
    attribute. If no model is provided, a default :class:`~sklearn.ensemble.RandomForestRegressor`
    is created and trained during :meth:`fit`.

    At prediction time, each sample is passed to all estimators, and their predictions
    (or predicted probabilities for classifiers) are used to construct the distribution.
    The sample is considered in-domain if the resulting probability of misclassification
    (PROB-STD) is lower than or equal to the specified threshold.

    Parameters
    ----------
    model : object, default=None
        Fitted ensemble model with accessible ``estimators_`` attribute and
        either ``.predict(X)`` or ``.predict_proba(X)`` method on each sub-estimator.
        If not provided, a default :class:`~sklearn.ensemble.RandomForestRegressor` will
        be created. Note that if you pass a fitted model here, call to :meth:`fit` is
        not necessary, but it will perform model validation.

    threshold : float, default=0.1
        Maximum allowed probability of incorrect class assignment.
        Lower values yield a stricter applicability domain.

    n_jobs : int, default=None
        The number of jobs to run in parallel. :meth:`transform_x_y` and
        :meth:`transform` are parallelized over the input molecules. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See scikit-learn documentation on ``n_jobs`` for more details.

    verbose : int or dict, default=0
        Controls the verbosity when filtering molecules.
        If a dictionary is passed, it is treated as kwargs for ``tqdm()``,
        and can be used to control the progress bar.

    References
    ----------
    .. [1] `Klingspohn, W., Mathea, M., ter Laak, A. et al.
        "Efficiency of different measures for defining the applicability
        domain of classification models."
        Journal of Cheminformatics 9, 44 (2017).
        <https://doi.org/10.1186/s13321-017-0230-2>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from skfp.applicability_domain import ProbStdADChecker
    >>> X_train = np.random.uniform(0, 1, size=(1000, 5))
    >>> y_train = (X_train[:, 0] + X_train[:, 1] > 1).astype(float)
    >>> model = RandomForestRegressor(n_estimators=10, random_state=0)
    >>> _ = model.fit(X_train, y_train)
    >>> probstd_ad_checker = ProbStdADChecker(model=model, threshold=0.1)
    >>> _ = probstd_ad_checker.fit()
    >>> probstd_ad_checker
    ProbStdADChecker(model=RandomForestRegressor(...), threshold=0.1)

    >>> X_test = np.random.uniform(0, 1, size=(100, 5))
    >>> probstd_ad_checker.predict(X_test).shape
    (100,)
    """

    _parameter_constraints: dict = {
        **BaseADChecker._parameter_constraints,
        "model": [object, None],
        # PROB-STD is the minimum of two tail masses, so it is bounded by 0.5
        "threshold": [Interval(Real, 0, 0.5, closed="left")],
    }

    def __init__(
        self,
        model: object | None = None,
        threshold: float = 0.1,
        n_jobs: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_jobs=n_jobs,
            verbose=verbose,
        )
        self.model = model
        self.threshold = threshold

    def _validate_params(self):
        """Validate constructor parameters and, for classifiers, model requirements."""
        super()._validate_params()

        # classifiers must be fitted, binary, and expose predict_proba;
        # regressors are only checked for estimators_ at prediction time
        if self.model is not None and is_classifier(self.model):
            check_is_fitted(self.model, "classes_")

            if not hasattr(self.model, "predict_proba"):
                raise InvalidParameterError(
                    f"{self.__class__.__name__} requires classifiers with .predict_proba() method"
                )

            if len(getattr(self.model, "classes_", [])) != 2:
                raise InvalidParameterError(
                    f"{self.__class__.__name__} only supports binary classifiers"
                )

    def fit(  # noqa: D102
        self,
        X: np.ndarray | None = None,
        y: np.ndarray | None = None,
    ):
        self._validate_params()

        if self.model is None:
            # no user model: train the default ensemble, which requires data
            if X is None or y is None:
                raise ValueError(
                    f"{self.__class__.__name__} requires X and y in .fit() "
                    "to train the default RandomForestRegressor when no model is provided"
                )
            X, y = validate_data(self, X, y, ensure_2d=False)
            self.model_ = RandomForestRegressor(random_state=0)
            self.model_.fit(X, y)
        else:
            # user-provided model is assumed fitted; validated in _compute_prob_std
            self.model_ = self.model

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: D102
        prob_std = self._compute_prob_std(X)
        # in-domain iff misclassification probability does not exceed threshold
        return prob_std <= self.threshold

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the applicability domain score of samples.
        It is defined as the minimum probabilistic mass under the normal distribution
        that lies on either side of the classification threshold (0.5).
        Lower values indicate higher confidence in class assignment.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Probabilistic scores reflecting the uncertainty of class assignment.
        """
        return self._compute_prob_std(X)

    def _compute_prob_std(self, X: np.ndarray) -> np.ndarray:
        """Compute PROB-STD scores: per-sample misclassification probability mass."""
        X = validate_data(self, X=X, reset=False)
        # allow a pre-fitted user model to be used without an explicit .fit() call
        if self.model is not None and not hasattr(self, "model_"):
            self.model_ = self.model
        check_is_fitted(self.model_, "estimators_")
        self._validate_params()

        if is_classifier(self.model_):
            # positive-class probability from each sub-estimator
            preds = np.array([est.predict_proba(X) for est in self.model_.estimators_])
            preds = preds[:, :, 1]  # shape: (n_estimators, n_samples)
        else:
            preds = np.array([est.predict(X) for est in self.model_.estimators_])

        preds = preds.T  # shape: (n_samples, n_estimators)

        y_mean = preds.mean(axis=1)
        y_std = preds.std(axis=1)
        # guard against zero std (all estimators agree), which would break norm.cdf
        y_std = np.maximum(y_std, 1e-8)

        # probability mass on the "wrong" side of the 0.5 decision boundary
        left_tail = norm.cdf(0.5, loc=y_mean, scale=y_std)
        prob_std = np.minimum(left_tail, 1 - left_tail)
        return prob_std
160 changes: 160 additions & 0 deletions skfp/applicability_domain/topkat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
from numbers import Real

import numpy as np
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import validate_data

from skfp.bases.base_ad_checker import BaseADChecker


class TOPKATADChecker(BaseADChecker):
    """
    TOPKAT (Optimal Prediction Space) method.

    Defines applicability domain using the Optimal Prediction Space (OPS) approach
    introduced in the TOPKAT system [1]_. The method transforms the input feature space
    into a normalized, centered and rotated space using PCA on a scaled [-1, 1] version
    of the training data (S-space). Each new sample is projected into the same OPS space,
    and a weighted distance (dOPS) from the center is computed.

    Samples are considered in-domain if their dOPS is below a threshold. By default,
    this threshold is computed as :math:`5 D / (2 N)`, where:

    - :math:`D` is the number of input features,
    - :math:`N` is the number of training samples.

    This method captures both the variance and correlation structure of the descriptors,
    and performs well for detecting global outliers in descriptor space.

    Parameters
    ----------
    threshold : float, default=None
        Optional user-defined threshold for dOPS. If provided, overrides the default
        analytical threshold :math:`5 D / (2 N)`. Lower values produce stricter domains.

    n_jobs : int, default=None
        The number of jobs to run in parallel. :meth:`transform_x_y` and
        :meth:`transform` are parallelized over the input molecules. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See scikit-learn documentation on ``n_jobs`` for more details.

    verbose : int or dict, default=0
        Controls the verbosity when filtering molecules.
        If a dictionary is passed, it is treated as kwargs for ``tqdm()``,
        and can be used to control the progress bar.

    References
    ----------
    .. [1] Gombar, V. K.
        `"Method and apparatus for validation of model-based predictions."
        U.S. Patent No. 6,036,349. Washington, DC: U.S. Patent and Trademark Office.
        <https://patents.google.com/patent/US6036349A/en>`_

    Examples
    --------
    >>> import numpy as np
    >>> from skfp.applicability_domain import TOPKATADChecker
    >>> from sklearn.datasets import make_blobs
    >>> X_train, _ = make_blobs(n_samples=100, centers=2, n_features=5, random_state=0)
    >>> X_test = X_train[:5]

    >>> topkat_ad_checker = TOPKATADChecker()
    >>> topkat_ad_checker.fit(X_train)
    TOPKATADChecker()

    >>> topkat_ad_checker.predict(X_test)
    array([ True,  True,  True,  True,  True])
    """

    _parameter_constraints: dict = {
        **BaseADChecker._parameter_constraints,
        "threshold": [Interval(Real, 0, None, closed="left"), None],
    }

    def __init__(
        self,
        threshold: float | None = None,
        n_jobs: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_jobs=n_jobs,
            verbose=verbose,
        )
        self.threshold = threshold

    def fit(  # noqa: D102
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,  # noqa: ARG002
    ):
        X = validate_data(self, X=X)

        self.X_min_ = X.min(axis=0)
        self.X_max_ = X.max(axis=0)
        self.range_ = self.X_max_ - self.X_min_
        self.num_points = X.shape[0]
        self.num_dims = X.shape[1]

        # TOPKAT S-space: feature-wise scaling of X to [-1, 1].
        # Avoid division by zero: where range==0, denom=1 => scaled value will be 0.
        self.denom_ = np.where(self.range_ != 0, self.range_, 1.0)
        S = (2 * X - self.X_max_ - self.X_min_) / self.denom_

        # Augment with bias (1) so the subsequent rotation (PCA) includes the intercept term.
        S_bias = np.c_[np.ones(S.shape[0]), S]

        cov_matrix = S_bias.T @ S_bias
        # eigh is used since cov_matrix is symmetric; its output is already real-valued
        self.eigen_val, self.eigen_vec = np.linalg.eigh(cov_matrix)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: D102
        dOPS = self._compute_dops(X)

        threshold = self.threshold
        if threshold is None:
            # analytical default: 5 * D / (2 * N)
            threshold = (5 * self.num_dims) / (2 * self.num_points)

        return dOPS < threshold

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the applicability domain score of samples.
        It is defined as the weighted distance (dOPS) from
        the center of the training data in the OPS-transformed space.
        Lower values indicate higher similarity to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Distance scores from the center of the Optimal Prediction Space (dOPS).
        """
        return self._compute_dops(X)

    def _compute_dops(self, X: np.ndarray) -> np.ndarray:
        """Project samples into OPS space and compute the weighted distance dOPS."""
        X = validate_data(self, X=X, reset=False)

        # Apply the same S-space transform as in fit().
        Ssample = (2 * X - self.X_max_ - self.X_min_) / self.denom_

        # Add bias term to match the augmented space used to compute eigenvectors.
        Ssample_bias = np.c_[np.ones(Ssample.shape[0]), Ssample]

        # Project to OPS space
        OPS_sample = Ssample_bias @ self.eigen_vec

        # Weight each OPS coordinate by the inverse eigenvalue; zero eigenvalues
        # (degenerate directions) contribute nothing instead of dividing by zero.
        inv_eigval = np.divide(
            1.0,
            self.eigen_val,
            out=np.zeros_like(self.eigen_val),
            where=self.eigen_val != 0,
        )
        dOPS = np.sum((OPS_sample**2) * inv_eigval, axis=1)
        return dOPS
Loading
Loading