Skip to content

Commit c11b480

Browse files
committed
OWDistances: Use only binary features in Jaccard
1 parent 5eba886 commit c11b480

File tree

4 files changed

+67
-6
lines changed

4 files changed

+67
-6
lines changed

Orange/distance/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
44
Mahalanobis, MahalanobisDistance, Hamming)
55

6-
from .base import _preprocess, remove_discrete_features, impute
6+
from .base import (
7+
_preprocess, remove_discrete_features, remove_nonbinary_features, impute)

Orange/distance/base.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ def remove_discrete_features(data):
3838
return data.transform(new_domain)
3939

4040

41+
def remove_nonbinary_features(data):
    """Return `data` transformed to a domain with binary attributes only.

    An attribute is kept when it is discrete and has exactly two values.
    Class variables and metas are passed to the new domain unchanged.
    """
    domain = data.domain
    binary_attrs = []
    for attr in domain.attributes:
        if attr.is_discrete and len(attr.values) == 2:
            binary_attrs.append(attr)
    binary_domain = Domain(binary_attrs, domain.class_vars, domain.metas)
    return data.transform(binary_domain)
49+
4150
def impute(data):
    """Impute missing values by passing `data` through a new `SklImpute`."""
    imputer = SklImpute()
    return imputer(data)

Orange/widgets/unsupervised/owdistances.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,14 @@ class Outputs:
5353

5454
class Error(OWWidget.Error):
    # Error messages the widget can show.
    no_continuous_features = Msg("No numeric features")
    # Shown by the Jaccard check when the data has no binary attribute.
    no_binary_features = Msg("No binary features")
    # Formatted with the metric's name when sparse data meets a metric
    # that does not support it.
    dense_metric_sparse_data = Msg("{} requires dense data.")
    distances_memory_error = Msg("Not enough memory")
    # The placeholder receives the details of the failure.
    distances_value_error = Msg("Problem in calculation:\n{}")
5960

6061
class Warning(OWWidget.Warning):
    # Non-fatal notices: the data is adjusted and the computation goes on.
    # Shown before discrete features are removed from the data.
    ignoring_discrete = Msg("Ignoring categorical features")
    # Shown before non-binary features are removed for the Jaccard metric.
    ignoring_nonbinary = Msg("Ignoring non-binary features")
    # Shown before missing values are imputed.
    imputing_data = Msg("Missing values were imputed")
6365

6466
def __init__(self):
@@ -112,6 +114,7 @@ def _check_sparse():
112114
if issparse(data.X) and not metric.supports_sparse:
113115
self.Error.dense_metric_sparse_data(METRICS[self.metric_idx][0])
114116
return False
117+
return True
115118

116119
def _fix_discrete():
117120
nonlocal data
@@ -124,18 +127,34 @@ def _fix_discrete():
124127
return False
125128
self.Warning.ignoring_discrete()
126129
data = distance.remove_discrete_features(data)
130+
return True
131+
132+
def _fix_nonbinary():
133+
nonlocal data
134+
if metric is distance.Jaccard:
135+
nbinary = sum(a.is_discrete and len(a.values) == 2
136+
for a in data.domain.attributes)
137+
if not nbinary:
138+
self.Error.no_binary_features()
139+
return False
140+
elif nbinary < len(data.domain.attributes):
141+
self.Warning.ignoring_nonbinary()
142+
data = distance.remove_nonbinary_features(data)
143+
return True
127144

128145
def _fix_missing():
129146
nonlocal data
130147
if not metric.supports_missing and bn.anynan(data.X):
131148
self.Warning.imputing_data()
132149
data = distance.impute(data)
150+
return True
133151

134152
self.clear_messages()
135153
if data is None:
136154
return
137-
for check in (_check_sparse, _fix_discrete, _fix_missing):
138-
if check() is False:
155+
for check in (_check_sparse,
156+
_fix_discrete, _fix_missing, _fix_nonbinary):
157+
if not check():
139158
return
140159
try:
141160
if metric.supports_normalization and self.normalized_dist:

Orange/widgets/unsupervised/tests/test_owdistances.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
import numpy as np
66

7-
from Orange.data import Table
7+
from Orange.data import Table, Domain
8+
from Orange import distance
89
from Orange.widgets.unsupervised.owdistances import OWDistances, METRICS
910
from Orange.widgets.tests.base import WidgetTest
1011

@@ -32,8 +33,9 @@ def test_distance_combo(self):
3233
else:
3334
expected = metric(self.iris)
3435

35-
np.testing.assert_array_equal(
36-
expected, self.get_output(self.widget.Outputs.distances))
36+
if metric is not distance.Jaccard:
37+
np.testing.assert_array_equal(
38+
expected, self.get_output(self.widget.Outputs.distances))
3739

3840
def test_error_message(self):
3941
"""Check if error message appears and then disappears when
@@ -46,6 +48,36 @@ def test_error_message(self):
4648
self.send_signal(self.widget.Inputs.data, None)
4749
self.assertFalse(self.widget.Error.no_continuous_features.is_shown())
4850

51+
def test_jaccard_messages(self):
    """Check the Jaccard-specific error and warning for various inputs.

    With Jaccard selected, the widget must show `no_binary_features` for
    data without binary attributes, `ignoring_nonbinary` for data that
    mixes binary and non-binary attributes, and neither message when the
    input is removed or when all attributes are binary.
    """
    widget = self.widget

    # Select Jaccard explicitly; fail loudly if it is not among METRICS
    # instead of silently running the checks against another metric.
    jaccard_idx = next(
        (idx for idx, (name, _) in enumerate(METRICS) if name == "Jaccard"),
        None)
    self.assertIsNotNone(jaccard_idx, "Jaccard is missing from METRICS")
    widget.metric_idx = jaccard_idx

    def check(data, error, warning):
        # Send `data` and verify which of the two messages are shown.
        self.send_signal(widget.Inputs.data, data)
        self.assertIs(
            bool(widget.Error.no_binary_features.is_shown()), error)
        self.assertIs(
            bool(widget.Warning.ignoring_nonbinary.is_shown()), warning)

    # This data triggers the error only.
    check(self.iris, error=True, warning=False)
    check(None, error=False, warning=False)

    # This data triggers the warning only.
    check(self.titanic, error=False, warning=True)
    check(None, error=False, warning=False)

    # The warning must reappear when the same data is sent again.
    check(self.titanic, error=False, warning=True)

    # Without titanic's first attribute neither message is expected.
    dom = self.titanic.domain
    dom = Domain(dom.attributes[1:], dom.class_var)
    check(self.titanic.transform(dom), error=False, warning=False)
80+
4981
def test_too_big_array(self):
5082
"""
5183
Users sees an error message when calculating too large arrays and Orange

0 commit comments

Comments
 (0)