Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 142 additions & 45 deletions src/coniferest/aadforest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from abc import abstractmethod
from numbers import Real
from typing import Callable

import numpy as np
from scipy.optimize import minimize
from scipy.special import expit, log_expit

from .calc_trees import calc_paths_sum, calc_paths_sum_transpose # noqa
from .coniferest import Coniferest, ConiferestEvaluator
Expand All @@ -14,8 +16,114 @@
class AADEvaluator(ConiferestEvaluator):
def __init__(self, aad):
super(AADEvaluator, self).__init__(aad, map_value=aad.map_value)

@abstractmethod
def score_samples(self, samples, weights=None):
"""
Evaluate scores for samples.
"""
raise NotImplementedError()

@abstractmethod
def fit_known(self, data, known_data, known_labels):
"""
Evaluate scores for samples.
"""
raise NotImplementedError()


class AADCrossEntropyEvaluator(AADEvaluator):
def __init__(self, aad):
super(AADCrossEntropyEvaluator, self).__init__(aad)
self.weights = np.ones(shape=(self.n_leaves,))
self.bias = 0.0 # Not sure about 0.0

def score_samples(self, x, weights=None):
# Anomaly score is a probability of being REGULAR data.

if not x.flags["C_CONTIGUOUS"]:
x = np.ascontiguousarray(x)

if weights is None:
weights = self.weights

return expit(
calc_paths_sum(
self.selectors,
self.node_offsets,
x,
weights,
num_threads=self.num_threads,
batch_size=self.get_batch_size(self.n_trees),
)
+ self.bias
)

def loss(self, weights, known_data, known_labels):
v = (
calc_paths_sum(
self.selectors,
self.node_offsets,
known_data,
weights[1:],
num_threads=self.num_threads,
batch_size=self.get_batch_size(self.n_trees),
)
+ weights[0]
)

return -np.sum(log_expit(known_labels * v))

def loss_gradient(self, weights, known_data, known_labels):
v = (
calc_paths_sum(
self.selectors,
self.node_offsets,
known_data,
weights[1:],
num_threads=self.num_threads,
batch_size=self.get_batch_size(self.n_trees),
)
+ weights[0]
)

dloss_dv = -known_labels * expit(-known_labels * v)
dloss_dbias = np.sum(dloss_dv)
dloss_dweights = calc_paths_sum_transpose(
self.selectors,
self.node_offsets,
self.leaf_offsets,
known_data,
dloss_dv,
num_threads=self.num_threads,
batch_size=self.get_batch_size(len(known_data)),
)

return np.concatenate([[dloss_dbias], dloss_dweights])

def loss_hessian(self, weights, vector, known_data, known_labels):
pass


class AADHingeEvaluator(AADEvaluator):
def __init__(self, aad):
super(AADHingeEvaluator, self).__init__(aad)
self.C_a = aad.C_a
self.budget = aad.budget
self.prior_influence = aad.prior_influence
self.weights = np.full(shape=(self.n_leaves,), fill_value=np.reciprocal(np.sqrt(self.n_leaves)))

def _q_tau(self, scores):
if isinstance(self.budget, int):
if self.budget >= len(scores):
return np.max(scores)

return np.partition(scores, self.budget)[self.budget]
elif isinstance(self.budget, float):
return np.quantile(scores, self.budget)

raise ValueError("self.budget must be an int or float")

def score_samples(self, x, weights=None):
"""
Perform the computations.
Expand Down Expand Up @@ -140,6 +248,26 @@ def loss_hessian(
):
return vector * prior_influence

def fit_known(self, data, known_data, known_labels):
scores = self.score_samples(data)
q_tau = self._q_tau(scores)

anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY)
nominal_count = np.count_nonzero(known_labels == Label.REGULAR)
prior_influence = self.prior_influence(anomaly_count, nominal_count)

res = minimize(
self.loss,
self.weights,
args=(known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence),
method="trust-krylov",
jac=self.loss_gradient,
hessp=self.loss_hessian,
tol=1e-4,
)
weights_norm = np.sqrt(np.inner(res.x, res.x))
self.weights = res.x / weights_norm


class AADForest(Coniferest):
"""
Expand Down Expand Up @@ -176,6 +304,10 @@ class AADForest(Coniferest):
map_value : ["const", "exponential", "linear", "reciprocal"] or callable, optional
An function applied to the leaf depth before weighting. Possible
meaning variants are: 1, 1-exp(-x), x, -1/x.

loss : ["hinge"], optional (default="hinge")
Loss function used to optimize the leaf weights. The default is the hinge loss,
as in the original paper.
"""

def __init__(
Expand All @@ -190,6 +322,7 @@ def __init__(
random_seed=None,
sampletrees_per_batch=1 << 20,
map_value=None,
loss="hinge",
Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loss parameter is added to support multiple loss functions, but the implementation doesn't actually use this parameter. The evaluator always uses the same loss function regardless of the loss value. Consider implementing the loss function selection logic or removing the unused parameter.

Copilot uses AI. Check for mistakes.
):
super().__init__(
trees=[],
Expand Down Expand Up @@ -231,23 +364,19 @@ def __init__(
else:
raise ValueError(f"map_value is neither a callable nor one of {', '.join(MAP_VALUES.keys())}.")

Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The LOSSES list contains only one item, making the validation somewhat redundant. Consider either removing this validation until multiple loss functions are implemented, or add a comment explaining the future extensibility plan.

Suggested change
# Currently, only the "hinge" loss function is supported.
# This list is designed for future extensibility to include additional loss functions.

Copilot uses AI. Check for mistakes.
LOSSES = ["hinge"]

if loss not in LOSSES:
raise ValueError(f"loss is not one of {', '.join(LOSSES)}.")

self.loss = loss
Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loss parameter is stored but never used in the implementation. Consider either implementing the loss function logic or adding a comment explaining how this will be used in future implementations.

Suggested change
self.loss = loss
self.loss = loss
# The `loss` parameter is currently not used in the implementation.
# It is reserved for future extensions where different loss functions
# may be incorporated into the anomaly detection logic.

Copilot uses AI. Check for mistakes.

self.evaluator = None

def _build_trees(self, data):
if len(self.trees) == 0:
self.trees = self.build_trees(data, self.n_trees)
self.evaluator = AADEvaluator(self)

def _q_tau(self, scores):
if isinstance(self.budget, int):
if self.budget >= len(scores):
return np.max(scores)

return np.partition(scores, self.budget)[self.budget]
elif isinstance(self.budget, float):
return np.quantile(scores, self.budget)

raise ValueError("self.budget must be an int or float")
self.evaluator = AADHingeEvaluator(self)

def fit(self, data, labels=None):
"""
Expand Down Expand Up @@ -311,39 +440,7 @@ def fit_known(self, data, known_data=None, known_labels=None):
):
return self

scores = self.score_samples(data)
q_tau = self._q_tau(scores)

anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY)
nominal_count = np.count_nonzero(known_labels == Label.REGULAR)
prior_influence = self.prior_influence(anomaly_count, nominal_count)

def fun(weights):
return self.evaluator.loss(
weights, known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence
)

def jac(weights):
return self.evaluator.loss_gradient(
weights, known_data, known_labels, anomaly_count, nominal_count, q_tau, self.C_a, prior_influence
)

def hessp(weights, vector):
return self.evaluator.loss_hessian(
weights,
vector,
known_data,
known_labels,
anomaly_count,
nominal_count,
q_tau,
self.C_a,
prior_influence,
)

res = minimize(fun, self.evaluator.weights, method="trust-krylov", jac=jac, hessp=hessp, tol=1e-4)
weights_norm = np.sqrt(np.inner(res.x, res.x))
self.evaluator.weights = res.x / weights_norm
self.evaluator.fit_known(data, known_data, known_labels)

return self

Expand Down
2 changes: 1 addition & 1 deletion tests/test_aadforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_benchmark_loss_gradient(n_samples, n_trees, n_jobs, benchmark):
anomaly_count = np.count_nonzero(known_labels == Label.ANOMALY)
nominal_count = np.count_nonzero(known_labels == Label.REGULAR)
scores = forest.score_samples(data)
q_tau = forest._q_tau(scores)
q_tau = forest.evaluator._q_tau(scores)

benchmark(
forest.evaluator.loss_gradient,
Expand Down
Loading