Skip to content

Commit 626b4d3

Browse files
committed
Sort values naturally when reading files
1 parent 56a106f commit 626b4d3

File tree

4 files changed

+93
-6
lines changed

4 files changed

+93
-6
lines changed

Orange/data/io_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from Orange.data import Table, Domain, Variable, DiscreteVariable, \
2020
StringVariable, ContinuousVariable, TimeVariable
2121
from Orange.data.io_util import Compression, open_compressed, \
22-
isnastr, guess_data_type, sanitize_variable
22+
isnastr, guess_data_type, sanitize_variable, natural_sorted
2323
from Orange.data.util import get_unique_names_duplicates
2424
from Orange.data.variable import VariableMeta
2525
from Orange.util import Registry, flatten, namegen
@@ -278,7 +278,7 @@ def _disc_column(data: np.ndarray, col: int) -> \
278278
def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \
279279
_ColumnProperties:
280280
vals, coltype = _TableBuilder._disc_column(data, col)
281-
return _ColumnProperties(valuemap=sorted(set(vals) - {""}),
281+
return _ColumnProperties(valuemap=natural_sorted(set(vals) - {""}),
282282
values=vals, coltype=coltype,
283283
orig_values=vals)
284284

Orange/data/io_util.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import subprocess
2+
import re
23
from collections import defaultdict
4+
from typing import List
35

46
import numpy as np
57
from chardet.universaldetector import UniversalDetector
@@ -111,6 +113,36 @@ def isnastr(arr, out=None):
111113
return __isnastr(arr, out=out)
112114

113115

116+
def natural_sorted(values: List) -> List:
    """
    Sort values with natural sort or human order - [sth1, sth2, sth10] or
    [1, 2, 10]

    Runs of decimal digits inside ``str`` or ``bytes`` elements are compared
    by their integer value; the text between them is compared as usual.
    Elements of any other type are compared as they are.

    Parameters
    ----------
    values
        Iterable with values to sort

    Returns
    -------
    List with sorted values
    """
    def natural_keys(element):
        """
        alist.sort(key=natural_keys) or sorted(alist, key=natural_keys) sorts
        in human order
        """
        if isinstance(element, str):
            chunks = re.split(r'(\d+)', element)
        elif isinstance(element, bytes):
            # a str pattern cannot be applied to bytes; use a bytes pattern
            chunks = re.split(rb'(\d+)', element)
        else:
            return element
        # Splitting on a capturing group always yields
        # [text, digits, text, digits, ..., text], so the odd positions are
        # exactly the `\d+` matches. Converting by position (instead of
        # asking `isdigit()`) avoids calling int() on characters such as
        # superscript digits, which `isdigit()` accepts but int() rejects.
        return [int(chunk) if index % 2 else chunk
                for index, chunk in enumerate(chunks)]

    return sorted(values, key=natural_keys)
144+
145+
114146
def guess_data_type(orig_values, namask=None):
115147
"""
116148
Use heuristics to guess data type.
@@ -121,7 +153,7 @@ def guess_data_type(orig_values, namask=None):
121153
if namask is None:
122154
namask = isnastr(orig_values)
123155
if is_discrete:
124-
valuemap = sorted(is_discrete)
156+
valuemap = natural_sorted(is_discrete)
125157
coltype = DiscreteVariable
126158
else:
127159
# try to parse as float

Orange/data/tests/test_io.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
55
TimeVariable
6-
from Orange.data.io_util import guess_data_type
6+
from Orange.data.io_util import guess_data_type, natural_sorted
77

88

99
class TestTableFilters(unittest.TestCase):
@@ -42,7 +42,7 @@ def test_guess_data_type_discrete(self):
4242
in_values = list(map(lambda x: str(x) + "a", range(24))) + ["a"] * 76
4343
valuemap, values, coltype = guess_data_type(in_values)
4444
self.assertEqual(DiscreteVariable, coltype)
45-
self.assertEqual(sorted(set(in_values)), valuemap)
45+
self.assertEqual(natural_sorted(set(in_values)), valuemap)
4646
np.testing.assert_array_equal(in_values, values)
4747

4848
def test_guess_data_type_string(self):
@@ -93,3 +93,54 @@ def test_guess_data_type_time(self):
9393
valuemap, _, coltype = guess_data_type(in_values)
9494
self.assertEqual(TimeVariable, coltype)
9595
self.assertIsNone(valuemap)
96+
97+
def test_guess_data_type_values_order(self):
98+
"""
99+
Test if values are ordered naturally
100+
"""
101+
in_values = [
102+
"something1", "something12", "something2", "something1",
103+
"something20", "something1", "something2", "something12",
104+
"something1", "something12"
105+
]
106+
res = ["something1", "something2", "something12", "something20"]
107+
valuemap, _, coltype = guess_data_type(in_values)
108+
self.assertEqual(DiscreteVariable, coltype)
109+
self.assertListEqual(res, valuemap)
110+
111+
112+
class TestUtils(unittest.TestCase):
    """Checks of ``natural_sorted`` on several kinds of input."""

    def test_natural_sorted(self):
        # Same text prefix, different numeric suffixes.
        unordered = ["something1", "something20", "something2", "something12"]
        expected = ["something1", "something2", "something12", "something20"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_text(self):
        # Strings without any digits.
        unordered = ["b", "aa", "c", "dd"]
        expected = ["aa", "b", "c", "dd"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_numbers_str(self):
        # Numbers given as strings.
        unordered = ["1", "20", "2", "12"]
        expected = ["1", "2", "12", "20"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_numbers(self):
        # Plain integers.
        unordered = [1, 20, 2, 12]
        expected = [1, 2, 12, 20]
        self.assertListEqual(expected, natural_sorted(unordered))
143+
144+
145+
if __name__ == "__main__":
146+
unittest.main()

Orange/widgets/utils/domaineditor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
1212
TimeVariable, Domain
13+
from Orange.data.io_util import natural_sorted
1314
from Orange.data.util import get_unique_names_duplicates
1415
from Orange.statistics.util import unique
1516
from Orange.widgets import gui
@@ -326,7 +327,10 @@ def numbers_are_round(var, col_data):
326327
elif tpe == type(orig_var):
327328
var = orig_var.copy(name=new_name)
328329
elif tpe == DiscreteVariable:
329-
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
330+
values = natural_sorted(
331+
list(str(i) for i in unique(col_data)
332+
if not self._is_missing(i))
333+
)
330334
round_numbers = numbers_are_round(orig_var, col_data)
331335
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
332336
for x in self._iter_vals(col_data)]

0 commit comments

Comments
 (0)