Skip to content

Commit 32b2bd4

Browse files
authored
Merge pull request #4120 from PrimozGodec/limit-discrete-variable
[FIX] Warning for discrete variable with >100 values in OWFile
2 parents 1fb5f8a + a480971 commit 32b2bd4

File tree

3 files changed

+84
-1
lines changed

3 files changed

+84
-1
lines changed

Orange/data/tests/test_io.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import unittest
2+
import numpy as np
3+
4+
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable
5+
from Orange.data.io import guess_data_type
6+
7+
8+
class TestTableFilters(unittest.TestCase):
    # Checks which variable type ``guess_data_type`` infers for a column of
    # raw values, together with the value map and converted values it
    # returns alongside the type.

    def _assert_guess(self, column, expected_type, expected_map,
                      expected_values):
        # Run ``guess_data_type`` on ``column`` and verify the returned
        # triple. ``expected_map=None`` means no value map may be returned.
        valuemap, values, coltype = guess_data_type(column)
        self.assertEqual(expected_type, coltype)
        if expected_map is None:
            self.assertIsNone(valuemap)
        else:
            self.assertEqual(expected_map, valuemap)
        np.testing.assert_array_equal(expected_values, values)

    def test_guess_data_type_continuous(self):
        # Numeric columns, given as ints or as numeric strings, should be
        # ContinuousVariable.
        self._assert_guess(
            list(range(1, 100)), ContinuousVariable, None,
            np.array(list(range(1, 100))))

        self._assert_guess(
            [1, 2, 3, 1, 2, 3], ContinuousVariable, None,
            [1, 2, 3, 1, 2, 3])

        self._assert_guess(
            ["1", "2", "3", "1", "2", "3"], ContinuousVariable, None,
            [1, 2, 3, 1, 2, 3])

    def test_guess_data_type_discrete(self):
        # Columns with few distinct values should be DiscreteVariable.
        self._assert_guess(
            [1, 2, 1, 2], DiscreteVariable, [1, 2], [1, 2, 1, 2])

        self._assert_guess(
            ["1", "2", "1", "2", "a"], DiscreteVariable, ["1", "2", "a"],
            ['1', '2', '1', '2', 'a'])

        # Just below the threshold for a string variable: 25 distinct
        # values among 100 entries.
        in_values = [str(i) + "a" for i in range(24)] + ["a"] * 76
        self._assert_guess(
            in_values, DiscreteVariable, sorted(set(in_values)), in_values)

    def test_guess_data_type_string(self):
        # Columns with too many distinct values for a discrete variable
        # should be StringVariable.
        in_values = [str(i) + "a" for i in range(90)]
        self._assert_guess(in_values, StringVariable, None, in_values)

        # More distinct values than len(values)**0.7.
        in_values = [str(i) + "a" for i in range(25)] + ["a"] * 75
        self._assert_guess(in_values, StringVariable, None, in_values)

        # More than 100 distinct values - exactly 101. Here the
        # len(values)**0.7 rule alone would vote for DiscreteVariable.
        in_values = [str(i) + "a" for i in range(100)] + ["a"] * 999
        self._assert_guess(in_values, StringVariable, None, in_values)

Orange/data/variable.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
DISCRETE_MAX_VALUES = 3 # == 2 + nan
2828
MAX_NUM_OF_DECIMALS = 5
29+
# a variable with more than 100 different values should not be a DiscreteVariable
30+
DISCRETE_MAX_ALLOWED_VALUES = 100
2931

3032

3133
def make_variable(cls, compute_value, *args):
@@ -63,7 +65,8 @@ def is_discrete_values(values):
6365
unique = set()
6466
for i in values:
6567
unique.add(i)
66-
if len(unique) > max_values:
68+
if (len(unique) > max_values or
69+
len(unique) > DISCRETE_MAX_ALLOWED_VALUES):
6770
return False
6871

6972
# Strip NaN from unique

Orange/widgets/data/owfile.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import logging
3+
from itertools import chain
34
from warnings import catch_warnings
45
from urllib.parse import urlparse
56
from typing import List
@@ -123,6 +124,8 @@ class Warning(widget.OWWidget.Warning):
123124
file_too_big = widget.Msg("The file is too large to load automatically."
124125
" Press Reload to load.")
125126
load_warning = widget.Msg("Read warning:\n{}")
127+
performance_warning = widget.Msg(
128+
"Categorical variables with >100 values may decrease performance.")
126129

127130
class Error(widget.OWWidget.Error):
128131
file_not_found = widget.Msg("File not found.")
@@ -468,7 +471,13 @@ def reset_domain_edit(self):
468471
self.domain_editor.reset_domain()
469472
self.apply_domain_edit()
470473

474+
def _inspect_discrete_variables(self, domain):
    """
    Show the performance warning if the domain has a large categorical.

    Scans ``domain.variables`` and ``domain.metas`` and triggers
    ``self.Warning.performance_warning`` once if any discrete variable
    has more than 100 values. Nothing is triggered otherwise.
    """
    # The warning message takes no per-variable arguments, so a single
    # match is enough; ``any`` stops at the first offender instead of
    # re-triggering the same warning for each one.
    if any(var.is_discrete and len(var.values) > 100
           for var in chain(domain.variables, domain.metas)):
        self.Warning.performance_warning()
478+
471479
def apply_domain_edit(self):
480+
self.Warning.performance_warning.clear()
472481
if self.data is None:
473482
table = None
474483
else:
@@ -481,6 +490,7 @@ def apply_domain_edit(self):
481490
table.name = self.data.name
482491
table.ids = np.array(self.data.ids)
483492
table.attributes = getattr(self.data, 'attributes', {})
493+
self._inspect_discrete_variables(domain)
484494

485495
self.Outputs.data.send(table)
486496
self.apply_button.setEnabled(False)

0 commit comments

Comments
 (0)