Skip to content

Commit 462dbe1

Browse files
committed
Outlier detection: keep instance ids, make thread safe
Outlier detection did not keep instance ids, so subsets did not work. It was also not thread-safe: concurrent calls to _OutlierModel.__call__ could result in undefined behaviour, because the transformed data was cached on the model object (self._cached_data).
1 parent d4e259f commit 462dbe1

File tree

2 files changed

+46
-39
lines changed

2 files changed

+46
-39
lines changed

Orange/classification/outlier_detection.py

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,43 +9,52 @@
99
from sklearn.svm import OneClassSVM
1010

1111
from Orange.base import SklLearner, SklModel
12-
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \
13-
Variable
14-
from Orange.data.util import get_unique_names
12+
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
13+
from Orange.data.util import get_unique_names, SharedComputeValue
1514
from Orange.preprocess import AdaptiveNormalize
16-
from Orange.util import wrap_callback, dummy_callback
15+
from Orange.util import dummy_callback
1716

1817
__all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner",
1918
"EllipticEnvelopeLearner", "OneClassSVMLearner"]
2019

2120

21+
class _CachedTransform:
22+
# to be used with SharedComputeValue
23+
def __init__(self, model):
24+
self.model = model
25+
26+
def __call__(self, data):
27+
return self.model.data_to_model_domain(data)
28+
29+
2230
class _OutlierModel(SklModel):
2331
def __init__(self, skl_model):
2432
super().__init__(skl_model)
2533
self._cached_data = None
2634
self.outlier_var = None
35+
self.cached_transform = _CachedTransform(self)
2736

2837
def predict(self, X: np.ndarray) -> np.ndarray:
2938
pred = self.skl_model.predict(X)
3039
pred[pred == -1] = 0
3140
return pred[:, None]
3241

42+
def new_domain(self, data: Table) -> Domain:
43+
assert self.outlier_var is not None
44+
return Domain(data.domain.attributes, data.domain.class_vars,
45+
data.domain.metas + (self.outlier_var,))
46+
3347
def __call__(self, data: Table, progress_callback: Callable = None) \
3448
-> Table:
3549
assert isinstance(data, Table)
36-
assert self.outlier_var is not None
3750

38-
domain = Domain(data.domain.attributes, data.domain.class_vars,
39-
data.domain.metas + (self.outlier_var,))
51+
domain = self.new_domain(data)
4052
if progress_callback is None:
4153
progress_callback = dummy_callback
42-
progress_callback(0, "Preprocessing...")
43-
self._cached_data = self.data_to_model_domain(
44-
data, wrap_callback(progress_callback, end=0.1))
45-
progress_callback(0.1, "Predicting...")
46-
metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
54+
progress_callback(0, "Predicting...")
55+
new_table = data.transform(domain)
4756
progress_callback(1)
48-
return Table.from_numpy(domain, data.X, data.Y, metas)
57+
return new_table
4958

5059

5160
class _OutlierLearner(SklLearner):
@@ -64,27 +73,17 @@ def _fit_model(self, data: Table) -> _OutlierModel:
6473
compute_value=transformer
6574
)
6675

67-
transformer.variable = variable
6876
model.outlier_var = variable
6977
return model
7078

7179

72-
class _Transformer:
80+
class _Transformer(SharedComputeValue):
7381
def __init__(self, model: _OutlierModel):
82+
super().__init__(model.cached_transform)
7483
self._model = model
75-
self._variable = None
76-
77-
@property
78-
def variable(self) -> Variable:
79-
return self._variable
8084

81-
@variable.setter
82-
def variable(self, var: Variable):
83-
self._variable = var
84-
85-
def __call__(self, data: Table) -> np.ndarray:
86-
assert isinstance(self._variable, Variable)
87-
return self._model(data).get_column_view(self._variable)[0]
85+
def compute(self, data: Table, shared_data: Table) -> np.ndarray:
86+
return self._model.predict(shared_data.X)[:, 0]
8887

8988

9089
class OneClassSVMLearner(_OutlierLearner):
@@ -142,13 +141,16 @@ def mahalanobis(self, observations: np.ndarray) -> np.ndarray:
142141
"""
143142
return self.skl_model.mahalanobis(observations)[:, None]
144143

145-
def __call__(self, data: Table, progress_callback: Callable = None) \
146-
-> Table:
147-
pred = super().__call__(data, progress_callback)
148-
domain = Domain(pred.domain.attributes, pred.domain.class_vars,
149-
pred.domain.metas + (self.mahal_var,))
150-
metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X)))
151-
return Table.from_numpy(domain, pred.X, pred.Y, metas)
144+
def new_domain(self, data: Table) -> Domain:
145+
assert self.mahal_var is not None
146+
domain = super().new_domain(data)
147+
return Domain(domain.attributes, domain.class_vars,
148+
domain.metas + (self.mahal_var,))
149+
150+
151+
class _TransformerMahalanobis(_Transformer):
152+
def compute(self, data: Table, shared_data: Table) -> np.ndarray:
153+
return self._model.mahalanobis(shared_data.X)[:, 0]
152154

153155

154156
class EllipticEnvelopeLearner(_OutlierLearner):
@@ -166,13 +168,12 @@ def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier:
166168
domain = data.domain
167169
model = super()._fit_model(data.transform(Domain(domain.attributes)))
168170

169-
transformer = _Transformer(model)
171+
transformer = _TransformerMahalanobis(model)
170172
names = [v.name for v in domain.variables + domain.metas]
171173
variable = ContinuousVariable(
172174
get_unique_names(names, "Mahalanobis"),
173175
compute_value=transformer
174176
)
175177

176-
transformer.variable = variable
177178
model.mahal_var = variable
178179
return model

Orange/classification/tests/test_outlier_detection.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pickle
44
import tempfile
55
import unittest
6-
from unittest.mock import Mock
6+
from unittest.mock import Mock, patch
77

88
import numpy as np
99

@@ -36,6 +36,7 @@ def assert_table_equal(self, table1, table2):
3636
np.testing.assert_array_equal(table1.metas, table2.metas)
3737

3838
def assert_table_appended_outlier(self, table1, table2, offset=1):
39+
np.testing.assert_array_equal(table1.ids, table2.ids)
3940
np.testing.assert_array_equal(table1.X, table2.X)
4041
np.testing.assert_array_equal(table1.Y, table2.Y)
4142
np.testing.assert_array_equal(table1.metas, table2.metas[:, :-offset])
@@ -47,7 +48,6 @@ def assert_table_appended_outlier(self, table1, table2, offset=1):
4748
self.assertEqual(table2.domain.metas[-offset].name, "Outlier")
4849
self.assertIsNotNone(table2.domain.metas[-offset].compute_value)
4950

50-
5151
class TestOneClassSVMLearner(_TestDetector):
5252
def test_OneClassSVM(self):
5353
np.random.seed(42)
@@ -128,12 +128,19 @@ def test_EllipticEnvelope(self):
128128
def test_mahalanobis(self):
129129
n = len(self.X_all)
130130
pred = self.model(self.X_all)
131+
131132
y_pred = pred[:, self.model.outlier_var].metas
132133
y_mahal = pred[:, self.model.mahal_var].metas
133134
y_mahal, y_pred = zip(*sorted(zip(y_mahal, y_pred), reverse=True))
134135
self.assertTrue(all(i == 0 for i in y_pred[:int(self.cont * n)]))
135136
self.assertTrue(all(i == 1 for i in y_pred[int(self.cont * n):]))
136137

138+
def test_single_data_to_model_domain(self):
139+
with patch.object(self.model, "data_to_model_domain",
140+
wraps=self.model.data_to_model_domain) as call:
141+
self.model(self.X_all)
142+
self.assertEqual(call.call_count, 1)
143+
137144
def test_EllipticEnvelope_ignores_y(self):
138145
domain = Domain((ContinuousVariable("x1"), ContinuousVariable("x2")),
139146
(ContinuousVariable("y1"), ContinuousVariable("y2")))
@@ -231,7 +238,6 @@ def test_transformer(self):
231238
detect = self.detector(self.iris)
232239
pred = detect(self.iris)
233240
var = pred.domain.metas[0]
234-
self.assertIs(var, var.compute_value.variable)
235241
np.testing.assert_array_equal(pred[:, "Outlier"].metas.ravel(),
236242
var.compute_value(self.iris))
237243

0 commit comments

Comments
 (0)