Skip to content

Commit 69e95f2

Browse files
authored
Merge pull request #4793 from PrimozGodec/sort-values
[ENH] Sort values naturally when reading files
2 parents 6790a6c + d8ccced commit 69e95f2

File tree

6 files changed

+95
-6
lines changed

6 files changed

+95
-6
lines changed

Orange/data/io_base.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
isnastr, guess_data_type, sanitize_variable
2323
from Orange.data.util import get_unique_names_duplicates
2424
from Orange.data.variable import VariableMeta
25+
from Orange.misc.collections import natural_sorted
2526
from Orange.util import Registry, flatten, namegen
2627

2728
__all__ = ["FileFormatBase", "Flags", "DataTableMixin", "PICKLE_PROTOCOL"]
@@ -278,7 +279,7 @@ def _disc_column(data: np.ndarray, col: int) -> \
278279
def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \
279280
_ColumnProperties:
280281
vals, coltype = _TableBuilder._disc_column(data, col)
281-
return _ColumnProperties(valuemap=sorted(set(vals) - {""}),
282+
return _ColumnProperties(valuemap=natural_sorted(set(vals) - {""}),
282283
values=vals, coltype=coltype,
283284
orig_values=vals)
284285

Orange/data/io_util.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
is_discrete_values, MISSING_VALUES, Variable,
99
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
1010
)
11+
from Orange.misc.collections import natural_sorted
1112

1213
__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
1314
"guess_data_type", "sanitize_variable"]
@@ -121,7 +122,7 @@ def guess_data_type(orig_values, namask=None):
121122
if namask is None:
122123
namask = isnastr(orig_values)
123124
if is_discrete:
124-
valuemap = sorted(is_discrete)
125+
valuemap = natural_sorted(is_discrete)
125126
coltype = DiscreteVariable
126127
else:
127128
# try to parse as float

Orange/data/tests/test_io.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
55
TimeVariable
66
from Orange.data.io_util import guess_data_type
7+
from Orange.misc.collections import natural_sorted
78

89

910
class TestTableFilters(unittest.TestCase):
10-
1111
def test_guess_data_type_continuous(self):
1212
# should be ContinuousVariable
1313
valuemap, values, coltype = guess_data_type(list(range(1, 100)))
@@ -42,7 +42,7 @@ def test_guess_data_type_discrete(self):
4242
in_values = list(map(lambda x: str(x) + "a", range(24))) + ["a"] * 76
4343
valuemap, values, coltype = guess_data_type(in_values)
4444
self.assertEqual(DiscreteVariable, coltype)
45-
self.assertEqual(sorted(set(in_values)), valuemap)
45+
self.assertEqual(natural_sorted(set(in_values)), valuemap)
4646
np.testing.assert_array_equal(in_values, values)
4747

4848
def test_guess_data_type_string(self):
@@ -93,3 +93,21 @@ def test_guess_data_type_time(self):
9393
valuemap, _, coltype = guess_data_type(in_values)
9494
self.assertEqual(TimeVariable, coltype)
9595
self.assertIsNone(valuemap)
96+
97+
def test_guess_data_type_values_order(self):
    """
    Test if values are ordered naturally

    The numeric suffixes must be compared as numbers, so "something2"
    is expected before "something12" in the guessed value map.
    """
    # Expected ordering of the distinct values: 1 < 2 < 12 < 20.
    expected_order = [
        "something1",
        "something2",
        "something12",
        "something20",
    ]
    # Repeated values so that the column is recognised as discrete.
    column = [
        "something1", "something12", "something2", "something1",
        "something20", "something1", "something2", "something12",
        "something1", "something12"
    ]
    valuemap, _values, coltype = guess_data_type(column)
    self.assertEqual(DiscreteVariable, coltype)
    self.assertListEqual(expected_order, valuemap)
110+
111+
112+
if __name__ == "__main__":
113+
unittest.main()

Orange/misc/collections.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import re
2+
from typing import List
3+
4+
15
class frozendict(dict):
26
def clear(self):
37
raise AttributeError("FrozenDict does not support method 'clear'")
@@ -20,3 +24,32 @@ def __setitem__(self, _key, _value):
2024
def __delitem__(self, _key):
2125
raise AttributeError("FrozenDict does not allow deleting elements")
2226

27+
28+
def natural_sorted(values: List) -> List:
    """
    Sort values with natural sort or human order - [sth1, sth2, sth10] or
    [1, 2, 10]

    Runs of decimal digits inside ``str`` (or ``bytes``) elements are
    compared by their integer value, the text between them is compared
    as text. Elements of any other type are compared as they are.

    Parameters
    ----------
    values
        Iterable with values to sort; it is not modified.

    Returns
    -------
    New list with sorted values
    """
    def natural_keys(element):
        """
        Build the sort key: a list alternating text and integer chunks
        for str/bytes elements, the element itself otherwise.
        """
        if isinstance(element, str):
            chunks = re.split(r'(\d+)', element)
        elif isinstance(element, bytes):
            # A str pattern cannot be applied to bytes (TypeError), so
            # bytes elements need the equivalent bytes pattern.
            chunks = re.split(rb'(\d+)', element)
        else:
            return element
        # Splitting on a capturing group puts the captured digit runs at
        # the odd indices. Only those are converted: deciding with
        # str.isdigit() would also pick text chunks such as "\u00b2", which
        # int() rejects with ValueError.
        return [int(chunk) if index % 2 else chunk
                for index, chunk in enumerate(chunks)]

    return sorted(values, key=natural_keys)

Orange/misc/tests/test_collections.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unittest
22

3-
from Orange.misc.collections import frozendict
3+
from Orange.misc.collections import frozendict, natural_sorted
44

55

66
class TestFrozenDict(unittest.TestCase):
@@ -29,5 +29,37 @@ def test_functions_as_dict(self):
2929
self.assertEqual(set(d.items()), {("a", 12), ("b", 13)})
3030

3131

32+
class TestUtils(unittest.TestCase):
    """Checks of natural_sorted on prefixed, textual and numeric values."""

    def test_natural_sorted(self):
        # Common text prefix followed by numbers of different lengths.
        unordered = ["something1", "something20", "something2", "something12"]
        expected = ["something1", "something2", "something12", "something20"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_text(self):
        # Without digits the order is the plain alphabetical one.
        unordered = ["b", "aa", "c", "dd"]
        expected = ["aa", "b", "c", "dd"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_numbers_str(self):
        # Numbers given as strings are ordered by value, not by character.
        unordered = ["1", "20", "2", "12"]
        expected = ["1", "2", "12", "20"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_numbers(self):
        # Non-string values are sorted as they are.
        unordered = [1, 20, 2, 12]
        expected = [1, 2, 12, 20]
        self.assertListEqual(expected, natural_sorted(unordered))
62+
63+
3264
if __name__ == "__main__":
3365
unittest.main()

Orange/widgets/utils/domaineditor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
1212
TimeVariable, Domain
13+
from Orange.misc.collections import natural_sorted
1314
from Orange.data.util import get_unique_names_duplicates
1415
from Orange.statistics.util import unique
1516
from Orange.widgets import gui
@@ -326,7 +327,10 @@ def numbers_are_round(var, col_data):
326327
elif tpe == type(orig_var):
327328
var = orig_var.copy(name=new_name)
328329
elif tpe == DiscreteVariable:
329-
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
330+
values = natural_sorted(
331+
list(str(i) for i in unique(col_data)
332+
if not self._is_missing(i))
333+
)
330334
round_numbers = numbers_are_round(orig_var, col_data)
331335
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
332336
for x in self._iter_vals(col_data)]

0 commit comments

Comments
 (0)