|
| 1 | +import unittest |
| 2 | +import numpy as np |
| 3 | + |
| 4 | +from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable |
| 5 | +from Orange.data.io import guess_data_type |
| 6 | + |
| 7 | + |
class TestTableFilters(unittest.TestCase):
    """Check the column type chosen by ``guess_data_type``.

    Each case unpacks the result as ``(valuemap, values, coltype)`` and
    verifies all three parts.
    """

    def test_guess_data_type_continuous(self):
        """Numeric columns should come back as ContinuousVariable."""
        # A long run of distinct integers.
        valuemap, values, coltype = guess_data_type(list(range(1, 100)))
        self.assertEqual(ContinuousVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(np.arange(1, 100), values)

        # The same repeating 1-2-3 pattern, first as ints and then as
        # numeric strings; both must yield the same numeric values.
        columns = (
            [1, 2, 3, 1, 2, 3],
            ["1", "2", "3", "1", "2", "3"],
        )
        for column in columns:
            valuemap, values, coltype = guess_data_type(column)
            self.assertEqual(ContinuousVariable, coltype)
            self.assertIsNone(valuemap)
            np.testing.assert_array_equal([1, 2, 3, 1, 2, 3], values)

    def test_guess_data_type_discrete(self):
        """Columns with few distinct values should be DiscreteVariable."""
        # Two distinct integer values.
        valuemap, values, coltype = guess_data_type([1, 2, 1, 2])
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual([1, 2], valuemap)
        np.testing.assert_array_equal([1, 2, 1, 2], values)

        # Numeric strings mixed with one non-numeric string.
        valuemap, values, coltype = guess_data_type(["1", "2", "1", "2", "a"])
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual(["1", "2", "a"], valuemap)
        np.testing.assert_array_equal(["1", "2", "1", "2", "a"], values)

        # Just below the threshold for a string variable.
        distinct = [f"{i}a" for i in range(24)]
        column = distinct + ["a"] * 76
        valuemap, values, coltype = guess_data_type(column)
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual(sorted(set(column)), valuemap)
        np.testing.assert_array_equal(column, values)

    def test_guess_data_type_string(self):
        """Columns with too many distinct values should be StringVariable."""
        # Every value is different: too many for a discrete variable.
        column = [f"{i}a" for i in range(90)]
        valuemap, values, coltype = guess_data_type(column)
        self.assertEqual(StringVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(column, values)

        # More distinct values than len(values)**0.7.
        column = [f"{i}a" for i in range(25)] + ["a"] * 75
        valuemap, values, coltype = guess_data_type(column)
        self.assertEqual(StringVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(column, values)

        # More than 100 different values - exactly 101. Here the
        # len(values)**0.7 rule alone would vote for DiscreteVariable.
        column = [f"{i}a" for i in range(100)] + ["a"] * 999
        valuemap, values, coltype = guess_data_type(column)
        self.assertEqual(StringVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(column, values)
0 commit comments