-
Notifications
You must be signed in to change notification settings - Fork 7
Add support for multiple loss functions in AADForest #267
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
32406d6
220236e
b613b66
2b74f34
8a6c272
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,8 +1,10 @@ | ||||||||||||
| from abc import abstractmethod | ||||||||||||
| from numbers import Real | ||||||||||||
| from typing import Callable | ||||||||||||
|
|
||||||||||||
| import numpy as np | ||||||||||||
| from scipy.optimize import minimize | ||||||||||||
| from scipy.special import expit, log_expit | ||||||||||||
|
|
||||||||||||
| from .calc_trees import calc_paths_sum, calc_paths_sum_transpose # noqa | ||||||||||||
| from .coniferest import Coniferest, ConiferestEvaluator | ||||||||||||
|
|
@@ -14,8 +16,114 @@ | |||||||||||
| class AADEvaluator(ConiferestEvaluator): | ||||||||||||
| def __init__(self, aad): | ||||||||||||
| super(AADEvaluator, self).__init__(aad, map_value=aad.map_value) | ||||||||||||
|
|
||||||||||||
| @abstractmethod | ||||||||||||
| def score_samples(self, samples, weights=None): | ||||||||||||
| """ | ||||||||||||
| Evaluate scores for samples. | ||||||||||||
| """ | ||||||||||||
| raise NotImplementedError() | ||||||||||||
|
|
||||||||||||
| @abstractmethod | ||||||||||||
| def fit_known(self, data, known_data, known_labels): | ||||||||||||
| """ | ||||||||||||
| Fit the leaf weights using known labeled samples. | ||||||||||||
| """ | ||||||||||||
| raise NotImplementedError() | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| class AADCrossEntropyEvaluator(AADEvaluator): | ||||||||||||
| def __init__(self, aad): | ||||||||||||
| super(AADCrossEntropyEvaluator, self).__init__(aad) | ||||||||||||
| self.weights = np.ones(shape=(self.n_leaves,)) | ||||||||||||
| self.bias = 0.0 # Not sure about 0.0 | ||||||||||||
|
|
||||||||||||
| def score_samples(self, x, weights=None): | ||||||||||||
| # Anomaly score is a probability of being REGULAR data. | ||||||||||||
|
|
||||||||||||
| if not x.flags["C_CONTIGUOUS"]: | ||||||||||||
| x = np.ascontiguousarray(x) | ||||||||||||
|
|
||||||||||||
| if weights is None: | ||||||||||||
| weights = self.weights | ||||||||||||
|
|
||||||||||||
| return expit( | ||||||||||||
| calc_paths_sum( | ||||||||||||
| self.selectors, | ||||||||||||
| self.node_offsets, | ||||||||||||
| x, | ||||||||||||
| weights, | ||||||||||||
| num_threads=self.num_threads, | ||||||||||||
| batch_size=self.get_batch_size(self.n_trees), | ||||||||||||
| ) | ||||||||||||
| + self.bias | ||||||||||||
| ) | ||||||||||||
|
|
||||||||||||
| def loss(self, weights, known_data, known_labels): | ||||||||||||
| v = ( | ||||||||||||
| calc_paths_sum( | ||||||||||||
| self.selectors, | ||||||||||||
| self.node_offsets, | ||||||||||||
| known_data, | ||||||||||||
| weights[1:], | ||||||||||||
| num_threads=self.num_threads, | ||||||||||||
| batch_size=self.get_batch_size(self.n_trees), | ||||||||||||
| ) | ||||||||||||
| + weights[0] | ||||||||||||
| ) | ||||||||||||
|
|
||||||||||||
| return -np.sum(log_expit(known_labels * v)) | ||||||||||||
|
|
||||||||||||
| def loss_gradient(self, weights, known_data, known_labels): | ||||||||||||
| v = ( | ||||||||||||
| calc_paths_sum( | ||||||||||||
| self.selectors, | ||||||||||||
| self.node_offsets, | ||||||||||||
| known_data, | ||||||||||||
| weights[1:], | ||||||||||||
| num_threads=self.num_threads, | ||||||||||||
| batch_size=self.get_batch_size(self.n_trees), | ||||||||||||
| ) | ||||||||||||
| + weights[0] | ||||||||||||
| ) | ||||||||||||
|
|
||||||||||||
| dloss_dv = -known_labels * expit(-known_labels * v) | ||||||||||||
| dloss_dbias = np.sum(dloss_dv) | ||||||||||||
| dloss_dweights = calc_paths_sum_transpose( | ||||||||||||
| self.selectors, | ||||||||||||
| self.node_offsets, | ||||||||||||
| self.leaf_offsets, | ||||||||||||
| known_data, | ||||||||||||
| dloss_dv, | ||||||||||||
| num_threads=self.num_threads, | ||||||||||||
| batch_size=self.get_batch_size(len(known_data)), | ||||||||||||
| ) | ||||||||||||
|
|
||||||||||||
| return np.concatenate([[dloss_dbias], dloss_dweights]) | ||||||||||||
|
|
||||||||||||
| def loss_hessian(self, weights, vector, known_data, known_labels): | ||||||||||||
| pass | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| class AADHingeEvaluator(AADEvaluator): | ||||||||||||
| def __init__(self, aad): | ||||||||||||
| super(AADHingeEvaluator, self).__init__(aad) | ||||||||||||
| self.C_a = aad.C_a | ||||||||||||
| self.budget = aad.budget | ||||||||||||
| self.prior_influence = aad.prior_influence | ||||||||||||
| self.weights = np.full(shape=(self.n_leaves,), fill_value=np.reciprocal(np.sqrt(self.n_leaves))) | ||||||||||||
|
|
||||||||||||
| def _q_tau(self, scores): | ||||||||||||
| if isinstance(self.budget, int): | ||||||||||||
| if self.budget >= len(scores): | ||||||||||||
| return np.max(scores) | ||||||||||||
|
|
||||||||||||
| return np.partition(scores, self.budget)[self.budget] | ||||||||||||
| elif isinstance(self.budget, float): | ||||||||||||
| return np.quantile(scores, self.budget) | ||||||||||||
|
|
||||||||||||
| raise ValueError("self.budget must be an int or float") | ||||||||||||
|
|
||||||||||||
| def score_samples(self, x, weights=None): | ||||||||||||
| """ | ||||||||||||
| Perform the computations. | ||||||||||||
|
|
@@ -140,6 +248,26 @@ def loss_hessian( | |||||||||||
| ): | ||||||||||||
| return vector * prior_influence | ||||||||||||
|
|
||||||||||||
| def fit_known(self, data, known_data, known_labels): | ||||||||||||
| scores = self.score_samples(data) | ||||||||||||
| q_tau = self._q_tau(scores) | ||||||||||||
|
|
||||||||||||
| anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY) | ||||||||||||
| nominal_count = np.count_nonzero(known_labels == Label.REGULAR) | ||||||||||||
| prior_influence = self.prior_influence(anomaly_count, nominal_count) | ||||||||||||
|
|
||||||||||||
| res = minimize( | ||||||||||||
| self.loss, | ||||||||||||
| self.weights, | ||||||||||||
| args=(known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence), | ||||||||||||
| method="trust-krylov", | ||||||||||||
| jac=self.loss_gradient, | ||||||||||||
| hessp=self.loss_hessian, | ||||||||||||
| tol=1e-4, | ||||||||||||
| ) | ||||||||||||
| weights_norm = np.sqrt(np.inner(res.x, res.x)) | ||||||||||||
| self.weights = res.x / weights_norm | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| class AADForest(Coniferest): | ||||||||||||
| """ | ||||||||||||
|
|
@@ -176,6 +304,10 @@ class AADForest(Coniferest): | |||||||||||
| map_value : ["const", "exponential", "linear", "reciprocal"] or callable, optional | ||||||||||||
| A function applied to the leaf depth before weighting. Possible | ||||||||||||
| meaning variants are: 1, 1-exp(-x), x, -1/x. | ||||||||||||
|
|
||||||||||||
| loss : ["hinge"], optional (default="hinge") | ||||||||||||
| Loss function used to optimize the leaf weights. The default is the hinge loss, | ||||||||||||
| as in the original paper. | ||||||||||||
| """ | ||||||||||||
|
|
||||||||||||
| def __init__( | ||||||||||||
|
|
@@ -190,6 +322,7 @@ def __init__( | |||||||||||
| random_seed=None, | ||||||||||||
| sampletrees_per_batch=1 << 20, | ||||||||||||
| map_value=None, | ||||||||||||
| loss="hinge", | ||||||||||||
| ): | ||||||||||||
| super().__init__( | ||||||||||||
| trees=[], | ||||||||||||
|
|
@@ -231,23 +364,19 @@ def __init__( | |||||||||||
| else: | ||||||||||||
| raise ValueError(f"map_value is neither a callable nor one of {', '.join(MAP_VALUES.keys())}.") | ||||||||||||
|
|
||||||||||||
|
||||||||||||
| # Currently, only the "hinge" loss function is supported. | |
| # This list is designed for future extensibility to include additional loss functions. |
Copilot
AI
Jul 30, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The loss parameter is stored but never used in the implementation. Consider either implementing the loss function logic or adding a comment explaining how this will be used in future implementations.
| self.loss = loss | |
| self.loss = loss | |
| # The `loss` parameter is currently not used in the implementation. | |
| # It is reserved for future extensions where different loss functions | |
| # may be incorporated into the anomaly detection logic. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `loss` parameter is added to support multiple loss functions, but the implementation doesn't actually use this parameter. The evaluator always uses the same loss function regardless of the `loss` value. Consider implementing the loss function selection logic or removing the unused parameter.