diff --git a/Orange/data/io_base.py b/Orange/data/io_base.py index 71ae66295e9..6d3510992bc 100644 --- a/Orange/data/io_base.py +++ b/Orange/data/io_base.py @@ -22,6 +22,7 @@ isnastr, guess_data_type, sanitize_variable from Orange.data.util import get_unique_names_duplicates from Orange.data.variable import VariableMeta +from Orange.misc.collections import natural_sorted from Orange.util import Registry, flatten, namegen __all__ = ["FileFormatBase", "Flags", "DataTableMixin", "PICKLE_PROTOCOL"] @@ -278,7 +279,7 @@ def _disc_column(data: np.ndarray, col: int) -> \ def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \ _ColumnProperties: vals, coltype = _TableBuilder._disc_column(data, col) - return _ColumnProperties(valuemap=sorted(set(vals) - {""}), + return _ColumnProperties(valuemap=natural_sorted(set(vals) - {""}), values=vals, coltype=coltype, orig_values=vals) diff --git a/Orange/data/io_util.py b/Orange/data/io_util.py index f79cbda04e0..b14d8c93f71 100644 --- a/Orange/data/io_util.py +++ b/Orange/data/io_util.py @@ -8,6 +8,7 @@ is_discrete_values, MISSING_VALUES, Variable, DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable, ) +from Orange.misc.collections import natural_sorted __all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr", "guess_data_type", "sanitize_variable"] @@ -121,7 +122,7 @@ def guess_data_type(orig_values, namask=None): if namask is None: namask = isnastr(orig_values) if is_discrete: - valuemap = sorted(is_discrete) + valuemap = natural_sorted(is_discrete) coltype = DiscreteVariable else: # try to parse as float diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index b4653f5401d..a4beac61ac6 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -4,10 +4,10 @@ from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \ TimeVariable from Orange.data.io_util import guess_data_type +from Orange.misc.collections import natural_sorted 
def natural_sorted(values: List) -> List:
    """
    Sort values in natural ("human") order, e.g. [sth1, sth2, sth10] or
    [1, 2, 10].

    Runs of decimal digits embedded in a string are compared by their numeric
    value; the text between them is compared as text.

    Parameters
    ----------
    values
        Values to sort. Any iterable accepted by ``sorted`` works (a list,
        a set, ...). ``str`` and ``bytes`` elements are ordered naturally;
        elements of any other type are used as their own sort key.

    Returns
    -------
    New list with sorted values
    """
    def natural_keys(element):
        """
        alist.sort(key=natural_keys) or sorted(alist, key=natural_keys) sorts
        in human order
        """
        if isinstance(element, bytes):
            # A str pattern cannot be applied to a bytes object (re raises
            # TypeError), so bytes need their own bytes pattern.
            parts = re.split(rb'(\d+)', element)
        elif isinstance(element, str):
            parts = re.split(r'(\d+)', element)
        else:
            return element
        # Splitting on a capturing group alternates text and digit runs, so
        # odd positions always hold the matched digits. Converting by
        # position instead of testing str.isdigit() avoids a ValueError on
        # text such as "²" (isdigit() is True but int() rejects it) and
        # guarantees keys compare text with text and int with int.
        parts[1::2] = [int(digits) for digits in parts[1::2]]
        return parts

    return sorted(values, key=natural_keys)
StringVariable, \ TimeVariable, Domain +from Orange.misc.collections import natural_sorted from Orange.data.util import get_unique_names_duplicates from Orange.statistics.util import unique from Orange.widgets import gui @@ -326,7 +327,10 @@ def numbers_are_round(var, col_data): elif tpe == type(orig_var): var = orig_var.copy(name=new_name) elif tpe == DiscreteVariable: - values = list(str(i) for i in unique(col_data) if not self._is_missing(i)) + values = natural_sorted( + list(str(i) for i in unique(col_data) + if not self._is_missing(i)) + ) round_numbers = numbers_are_round(orig_var, col_data) col_data = [np.nan if self._is_missing(x) else values.index(str(x)) for x in self._iter_vals(col_data)]