diff --git a/src/coniferest/aadforest.py b/src/coniferest/aadforest.py index f276bc4..8fb94c4 100644 --- a/src/coniferest/aadforest.py +++ b/src/coniferest/aadforest.py @@ -1,8 +1,10 @@ +from abc import abstractmethod from numbers import Real from typing import Callable import numpy as np from scipy.optimize import minimize +from scipy.special import expit, log_expit from .calc_trees import calc_paths_sum, calc_paths_sum_transpose # noqa from .coniferest import Coniferest, ConiferestEvaluator @@ -14,8 +16,114 @@ class AADEvaluator(ConiferestEvaluator): def __init__(self, aad): super(AADEvaluator, self).__init__(aad, map_value=aad.map_value) + + @abstractmethod + def score_samples(self, samples, weights=None): + """ + Evaluate scores for samples. + """ + raise NotImplementedError() + + @abstractmethod + def fit_known(self, data, known_data, known_labels): + """ + Evaluate scores for samples. + """ + raise NotImplementedError() + + +class AADCrossEntropyEvaluator(AADEvaluator): + def __init__(self, aad): + super(AADCrossEntropyEvaluator, self).__init__(aad) + self.weights = np.ones(shape=(self.n_leaves,)) + self.bias = 0.0 # Not sure about 0.0 + + def score_samples(self, x, weights=None): + # Anomaly score is a probability of being REGULAR data. + + if not x.flags["C_CONTIGUOUS"]: + x = np.ascontiguousarray(x) + + if weights is None: + weights = self.weights + + return expit( + calc_paths_sum( + self.selectors, + self.node_offsets, + x, + weights, + num_threads=self.num_threads, + batch_size=self.get_batch_size(self.n_trees), + ) + + self.bias + ) + + def loss(self, weights, known_data, known_labels): + v = ( + calc_paths_sum( + self.selectors, + self.node_offsets, + known_data, + weights[1:], + num_threads=self.num_threads, + batch_size=self.get_batch_size(self.n_trees), + ) + + weights[0] + ) + + return -np.sum(log_expit(known_labels * v)) + + def loss_gradient(self, weights, known_data, known_labels): + v = ( + calc_paths_sum( + self.selectors, + self.node_offsets, + known_data, + weights[1:], + num_threads=self.num_threads, + batch_size=self.get_batch_size(self.n_trees), + ) + + weights[0] + ) + + dloss_dv = -known_labels * expit(-known_labels * v) + dloss_dbias = np.sum(dloss_dv) + dloss_dweights = calc_paths_sum_transpose( + self.selectors, + self.node_offsets, + self.leaf_offsets, + known_data, + dloss_dv, + num_threads=self.num_threads, + batch_size=self.get_batch_size(len(known_data)), + ) + + return np.concatenate([[dloss_dbias], dloss_dweights]) + + def loss_hessian(self, weights, vector, known_data, known_labels): + pass + + +class AADHingeEvaluator(AADEvaluator): + def __init__(self, aad): + super(AADHingeEvaluator, self).__init__(aad) + self.C_a = aad.C_a + self.budget = aad.budget + self.prior_influence = aad.prior_influence self.weights = np.full(shape=(self.n_leaves,), fill_value=np.reciprocal(np.sqrt(self.n_leaves))) + def _q_tau(self, scores): + if isinstance(self.budget, int): + if self.budget >= len(scores): + return np.max(scores) + + return np.partition(scores, self.budget)[self.budget] + elif isinstance(self.budget, float): + return np.quantile(scores, self.budget) + + raise ValueError("self.budget must be an int or float") + def score_samples(self, x, weights=None): """ Perform the computations. @@ -140,6 +248,26 @@ def loss_hessian( ): return vector * prior_influence + def fit_known(self, data, known_data, known_labels): + scores = self.score_samples(data) + q_tau = self._q_tau(scores) + + anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY) + nominal_count = np.count_nonzero(known_labels == Label.REGULAR) + prior_influence = self.prior_influence(anomaly_count, nominal_count) + + res = minimize( + self.loss, + self.weights, + args=(known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence), + method="trust-krylov", + jac=self.loss_gradient, + hessp=self.loss_hessian, + tol=1e-4, + ) + weights_norm = np.sqrt(np.inner(res.x, res.x)) + self.weights = res.x / weights_norm + class AADForest(Coniferest): """ @@ -176,6 +304,10 @@ class AADForest(Coniferest): map_value : ["const", "exponential", "linear", "reciprocal"] or callable, optional An function applied to the leaf depth before weighting. Possible meaning variants are: 1, 1-exp(-x), x, -1/x. + + loss : ["hinge"], optional (default="hinge") + Loss function used to optimize the leaf weights. The default is the hinge loss, + as in the original paper. """ def __init__( @@ -190,6 +322,7 @@ def __init__( random_seed=None, sampletrees_per_batch=1 << 20, map_value=None, + loss="hinge", ): super().__init__( trees=[], @@ -231,23 +364,19 @@ def __init__( else: raise ValueError(f"map_value is neither a callable nor one of {', '.join(MAP_VALUES.keys())}.") + LOSSES = ["hinge"] + + if loss not in LOSSES: + raise ValueError(f"loss is not one of {', '.join(LOSSES)}.") + + self.loss = loss + self.evaluator = None def _build_trees(self, data): if len(self.trees) == 0: self.trees = self.build_trees(data, self.n_trees) - self.evaluator = AADEvaluator(self) - - def _q_tau(self, scores): - if isinstance(self.budget, int): - if self.budget >= len(scores): - return np.max(scores) - - return np.partition(scores, self.budget)[self.budget] - elif isinstance(self.budget, float): - return np.quantile(scores, self.budget) - - raise ValueError("self.budget must be an int or float") + self.evaluator = AADHingeEvaluator(self) def fit(self, data, labels=None): """ @@ -311,39 +440,7 @@ def fit_known(self, data, known_data=None, known_labels=None): ): return self - scores = self.score_samples(data) - q_tau = self._q_tau(scores) - - anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY) - nominal_count = np.count_nonzero(known_labels == Label.REGULAR) - prior_influence = self.prior_influence(anomaly_count, nominal_count) - - def fun(weights): - return self.evaluator.loss( - weights, known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence - ) - - def jac(weights): - return self.evaluator.loss_gradient( - weights, known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence - ) - - def hessp(weights, vector): - return self.evaluator.loss_hessian( - weights, - vector, - known_data, - known_labels, - anomaly_count, - nominal_count, - q_tau, - self.C_a, - prior_influence, - ) - - res = minimize(fun, self.evaluator.weights, method="trust-krylov", jac=jac, hessp=hessp, tol=1e-4) - weights_norm = np.sqrt(np.inner(res.x, res.x)) - self.evaluator.weights = res.x / weights_norm + self.evaluator.fit_known(data, known_data, known_labels) return self diff --git a/tests/test_aadforest.py b/tests/test_aadforest.py index 331f364..01190c6 100644 --- a/tests/test_aadforest.py +++ b/tests/test_aadforest.py @@ -113,7 +113,7 @@ def test_benchmark_loss_gradient(n_samples, n_trees, n_jobs, benchmark): anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY) nominal_count = np.count_nonzero(known_labels == Label.REGULAR) scores = forest.score_samples(data) - q_tau = forest._q_tau(scores) + q_tau = forest.evaluator._q_tau(scores) benchmark( forest.evaluator.loss_gradient,