|
| 1 | +from numbers import Real |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +from scipy.stats import norm |
| 5 | +from sklearn.base import is_classifier |
| 6 | +from sklearn.ensemble import RandomForestRegressor |
| 7 | +from sklearn.utils._param_validation import Interval, InvalidParameterError |
| 8 | +from sklearn.utils.validation import check_is_fitted, validate_data |
| 9 | + |
| 10 | +from skfp.bases.base_ad_checker import BaseADChecker |
| 11 | + |
| 12 | + |
class ProbStdADChecker(BaseADChecker):
    """
    Probabilistic standard deviation method (PROB-STD).

    Defines applicability domain based on the probabilistic interpretation of prediction
    uncertainty from individual estimators in an ensemble model [1]_. For each sample,
    the mean and standard deviation of the ensemble predictions are used to construct
    a normal distribution. The score is defined as the probability mass under this
    distribution that lies on the wrong side of the classification threshold (0.5).

    This approach supports both regression models (using ``.predict(X)``) and binary classifiers
    (using ``.predict_proba(X)`` and the probability of the positive class). For regression models,
    the outputs should be interpretable as positive-class probabilities in [0, 1], e.g. when the
    regressor is trained on binary targets. The ensemble model must expose the ``estimators_``
    attribute. If no model is provided, a default :class:`~sklearn.ensemble.RandomForestRegressor`
    is created and trained during :meth:`fit`.

    At prediction time, each sample is passed to all estimators, and their predictions
    (or predicted probabilities for classifiers) are used to construct the distribution.
    The sample is considered in-domain if the resulting probability of misclassification
    (PROB-STD) is lower than or equal to the specified threshold.

    Parameters
    ----------
    model : object, default=None
        Fitted ensemble model with accessible ``estimators_`` attribute and
        either ``.predict(X)`` or ``.predict_proba(X)`` method on each sub-estimator.
        If not provided, a default :class:`~sklearn.ensemble.RandomForestRegressor` will
        be created. Note that if you pass a fitted model here, call to :meth:`fit` is
        not necessary, but it will perform model validation.

    threshold : float, default=0.1
        Maximum allowed probability of incorrect class assignment.
        Lower values yield a stricter applicability domain.

    n_jobs : int, default=None
        The number of jobs to run in parallel. :meth:`transform_x_y` and
        :meth:`transform` are parallelized over the input molecules. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See scikit-learn documentation on ``n_jobs`` for more details.

    verbose : int or dict, default=0
        Controls the verbosity when filtering molecules.
        If a dictionary is passed, it is treated as kwargs for ``tqdm()``,
        and can be used to control the progress bar.

    References
    ----------
    .. [1] `Klingspohn, W., Mathea, M., ter Laak, A. et al.
        "Efficiency of different measures for defining the applicability
        domain of classification models."
        Journal of Cheminformatics 9, 44 (2017).
        <https://doi.org/10.1186/s13321-017-0230-2>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from skfp.applicability_domain import ProbStdADChecker
    >>> X_train = np.random.uniform(0, 1, size=(1000, 5))
    >>> y_train = (X_train[:, 0] + X_train[:, 1] > 1).astype(float)
    >>> model = RandomForestRegressor(n_estimators=10, random_state=0)
    >>> _ = model.fit(X_train, y_train)
    >>> probstd_ad_checker = ProbStdADChecker(model=model, threshold=0.1)
    >>> _ = probstd_ad_checker.fit()
    >>> probstd_ad_checker
    ProbStdADChecker(model=RandomForestRegressor(...), threshold=0.1)

    >>> X_test = np.random.uniform(0, 1, size=(100, 5))
    >>> probstd_ad_checker.predict(X_test).shape
    (100,)
    """

    _parameter_constraints: dict = {
        **BaseADChecker._parameter_constraints,
        "model": [object, None],
        # threshold must stay below 0.5: at 0.5 every sample would pass
        "threshold": [Interval(Real, 0, 0.5, closed="left")],
    }

    def __init__(
        self,
        model: object | None = None,
        threshold: float = 0.1,
        n_jobs: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_jobs=n_jobs,
            verbose=verbose,
        )
        self.model = model
        self.threshold = threshold

    def _validate_params(self):
        super()._validate_params()

        # user-provided classifiers must be fitted, binary, and expose predict_proba;
        # regressors are validated later via check_is_fitted on estimators_
        if self.model is not None and is_classifier(self.model):
            check_is_fitted(self.model, "classes_")

            if not hasattr(self.model, "predict_proba"):
                raise InvalidParameterError(
                    f"{self.__class__.__name__} requires classifiers with .predict_proba() method"
                )

            if len(getattr(self.model, "classes_", [])) != 2:
                raise InvalidParameterError(
                    f"{self.__class__.__name__} only supports binary classifiers"
                )

    def fit(  # noqa: D102
        self,
        X: np.ndarray | None = None,
        y: np.ndarray | None = None,
    ):
        self._validate_params()

        if self.model is None:
            # no model given: train a default random forest on (X, y)
            X, y = validate_data(self, X, y, ensure_2d=False)
            self.model_ = RandomForestRegressor(random_state=0)
            self.model_.fit(X, y)  # type: ignore[union-attr]
        else:
            # model already fitted by the user; just alias it
            self.model_ = self.model

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: D102
        prob_std = self._compute_prob_std(X)
        return prob_std <= self.threshold

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the applicability domain score of samples.
        It is defined as the minimum probabilistic mass under the normal distribution
        that lies on either side of the classification threshold (0.5).
        Lower values indicate higher confidence in class assignment.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Probabilistic scores reflecting the uncertainty of class assignment.
        """
        return self._compute_prob_std(X)

    def _compute_prob_std(self, X: np.ndarray) -> np.ndarray:
        # Compute the PROB-STD score: the probability mass of N(mean, std)
        # over ensemble predictions that falls on the wrong side of 0.5.
        X = validate_data(self, X=X, reset=False)
        # allow using a pre-fitted model without an explicit .fit() call
        if self.model is not None and not hasattr(self, "model_"):
            self.model_ = self.model
        check_is_fitted(self.model_, "estimators_")
        self._validate_params()

        if is_classifier(self.model_):
            preds = np.array([est.predict_proba(X) for est in self.model_.estimators_])
            preds = preds[:, :, 1]  # keep positive-class column -> (n_estimators, n_samples)
        else:
            preds = np.array([est.predict(X) for est in self.model_.estimators_])

        preds = preds.T  # shape: (n_samples, n_estimators)

        y_mean = preds.mean(axis=1)
        y_std = preds.std(axis=1)
        # clamp std to avoid degenerate scale=0 normal distributions
        # when all estimators agree exactly
        y_std = np.maximum(y_std, 1e-8)

        # probability mass below the 0.5 decision threshold; the score is the
        # smaller tail, i.e. the chance of landing on the wrong side
        left_tail = norm.cdf(0.5, loc=y_mean, scale=y_std)
        prob_std = np.minimum(left_tail, 1 - left_tail)
        return prob_std
0 commit comments