Skip to content

Commit aa1ab0b

Browse files
authored
Merge pull request #1844 from VesnaT/transpose_guess_type
[ENH] Table.transpose: Use heuristic to guess data type of attributes of attributes
2 parents e380ca2 + ff2cdaa commit aa1ab0b

File tree

3 files changed

+145
-73
lines changed

3 files changed

+145
-73
lines changed

Orange/data/io.py

Lines changed: 86 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,80 @@ def _from_file(f):
101101
return _from_file(filename)
102102

103103

104+
def guess_data_type(orig_values):
    """
    Heuristically pick a variable type for a column of raw values.

    The candidates are tried in this order: discrete (when
    `is_discrete_values` returns something truthy), continuous (when every
    value converts with `float`), time (when every value is accepted by
    `TimeVariable.parse`), and finally string.

    Returns a `(valuemap, values, coltype)` tuple:

    - `valuemap` is the sorted result of `is_discrete_values` for discrete
      columns and an empty list otherwise;
    - `values` holds the converted floats / parsed times for continuous and
      time columns, and is `orig_values` itself for discrete and string
      columns;
    - `coltype` is the chosen variable class.
    """
    discrete_values = is_discrete_values(orig_values)
    if discrete_values:
        return sorted(discrete_values), orig_values, DiscreteVariable

    # Not discrete: try plain numbers first.
    try:
        numbers = [float(raw) for raw in orig_values]
    except ValueError:
        pass
    else:
        return [], numbers, ContinuousVariable

    # Not numeric either: see whether every value parses as a time.
    time_parser = TimeVariable('_')
    try:
        timestamps = [time_parser.parse(raw) for raw in orig_values]
    except ValueError:
        # Nothing matched, keep the raw values as strings.
        return [], orig_values, StringVariable
    return [], timestamps, TimeVariable
127+
128+
129+
def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      domain_vars, existing_var, new_var_name, data=None):
    """
    Convert a column's values for `coltype` and create the matching variable.

    Parameters:
        valuemap: list of discrete values. When non-empty, `orig_values` are
            replaced by their index in this list (`nan` for values that are
            not in it) and the list is stored as `coltype_kwargs['values']`.
        values: the values converted so far; returned as they are unless one
            of the conversions in this function replaces them.
        orig_values: the raw, unconverted values of the column.
        coltype: the variable class to instantiate.
        coltype_kwargs: keyword arguments for the variable constructor.
            NOTE: this dict is updated in place when `valuemap` is non-empty.
        domain_vars: only compared against `None` here; when it is `None`
            no variable is created and `None` is returned in its place.
        existing_var: name of a variable to obtain through `coltype.make`;
            when falsy, a new variable named `new_var_name` is constructed.
        new_var_name: name for a newly constructed variable.
        data: optional array. When given and one-dimensional it is
            re-indexed in place (instead of `values`) if the discrete values
            have to be reordered.

    Returns:
        A `(values, var)` tuple with the converted values and the variable
        (or `None` when `domain_vars` is `None`).
    """
    if valuemap:
        # Map discrete data to ints
        def valuemap_index(val):
            try:
                return valuemap.index(val)
            except ValueError:
                return np.nan

        values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
        coltype_kwargs.update(values=valuemap)

    if coltype is StringVariable:
        values = ['' if i is np.nan else i for i in orig_values]

    var = None
    if domain_vars is not None:
        if existing_var:
            # Use existing variable if available
            var = coltype.make(existing_var.strip(), **coltype_kwargs)
        else:
            # Never use existing for un-named variables
            var = coltype(new_var_name, **coltype_kwargs)

        # Reorder discrete values to match existing variable
        if var.is_discrete and not var.ordered:
            new_order = var.values
            old_order = coltype_kwargs.get('values', var.values)
            if new_order != old_order:
                offset = len(new_order)
                # `data` is optional: without it there is no array to
                # inspect, so fall back to re-indexing `values` rather
                # than failing on `None.ndim`.
                if data is None or data.ndim > 1:
                    column = values
                else:
                    column = data
                # Shift all indices out of the valid range first so that
                # already-remapped entries cannot be remapped a second time.
                column += offset
                for val in new_order:
                    try:
                        oldval = old_order.index(val)
                    except ValueError:
                        continue
                    bn.replace(column, offset + oldval, new_order.index(val))

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var
176+
177+
104178
class Flags:
105179
"""Parser for column flags (i.e. third header row)"""
106180
DELIMITER = ' '
@@ -522,6 +596,7 @@ def _equal_length(lst):
522596

523597
elif (type_flag in DiscreteVariable.TYPE_HEADERS or
524598
_RE_DISCRETE_LIST.match(type_flag)):
599+
coltype = DiscreteVariable
525600
if _RE_DISCRETE_LIST.match(type_flag):
526601
valuemap = Flags.split(type_flag)
527602
coltype_kwargs.update(ordered=True)
@@ -530,38 +605,7 @@ def _equal_length(lst):
530605

531606
else:
532607
# No known type specified, use heuristics
533-
is_discrete = is_discrete_values(orig_values)
534-
if is_discrete:
535-
valuemap = sorted(is_discrete)
536-
else:
537-
try:
538-
values = [float(i) for i in orig_values]
539-
except ValueError:
540-
tvar = TimeVariable('_')
541-
try:
542-
values = [tvar.parse(i) for i in orig_values]
543-
except ValueError:
544-
coltype = StringVariable
545-
else:
546-
coltype = TimeVariable
547-
else:
548-
coltype = ContinuousVariable
549-
550-
if valuemap:
551-
# Map discrete data to ints
552-
def valuemap_index(val):
553-
try:
554-
return valuemap.index(val)
555-
except ValueError:
556-
return np.nan
557-
558-
values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
559-
coltype = DiscreteVariable
560-
coltype_kwargs.update(values=valuemap)
561-
562-
if coltype is StringVariable:
563-
values = ['' if i is np.nan else i
564-
for i in orig_values]
608+
valuemap, values, coltype = guess_data_type(orig_values)
565609

566610
if flag.m or coltype is StringVariable:
567611
append_to = (Mcols, metas)
@@ -574,37 +618,20 @@ def valuemap_index(val):
574618

575619
cols, domain_vars = append_to
576620
cols.append(col)
577-
var = None
621+
622+
existing_var, new_var_name, column = None, None, None
623+
if domain_vars is not None:
624+
existing_var = names and names[col]
625+
if not existing_var:
626+
new_var_name = next(NAMEGEN)
627+
628+
values, var = sanitize_variable(
629+
valuemap, values, orig_values, coltype, coltype_kwargs,
630+
domain_vars, existing_var, new_var_name, data)
578631
if domain_vars is not None:
579-
if names and names[col]:
580-
# Use existing variable if available
581-
var = coltype.make(names[col].strip(), **coltype_kwargs)
582-
else:
583-
# Never use existing for un-named variables
584-
var = coltype(next(NAMEGEN), **coltype_kwargs)
585632
var.attributes.update(flag.attributes)
586633
domain_vars.append(var)
587634

588-
# Reorder discrete values to match existing variable
589-
if var.is_discrete and not var.ordered:
590-
new_order, old_order = var.values, coltype_kwargs.get('values', var.values)
591-
if new_order != old_order:
592-
offset = len(new_order)
593-
column = values if data.ndim > 1 else data
594-
column += offset
595-
for i, val in enumerate(var.values):
596-
try:
597-
oldval = old_order.index(val)
598-
except ValueError:
599-
continue
600-
bn.replace(column, offset + oldval, new_order.index(val))
601-
602-
if isinstance(var, TimeVariable) or coltype is TimeVariable:
603-
# Re-parse the values because only now after coltype.make call
604-
# above, variable var is the correct one
605-
_var = var if isinstance(var, TimeVariable) else TimeVariable('_')
606-
values = [_var.parse(i) for i in orig_values]
607-
608635
# Write back the changed data. This is needed to pass the
609636
# correct, converted values into Table.from_numpy below
610637
try:

Orange/data/table.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import bottleneck as bn
1414
from scipy import sparse as sp
1515

16+
import Orange.data # import for io.py
1617
from Orange.data import (
1718
_contingency, _valuecount,
1819
Domain, Variable, Storage, StringVariable, Unknown, Value, Instance,
@@ -1453,6 +1454,7 @@ def transpose(cls, table, feature_names_column="",
14531454
feature names are mapped
14541455
:return: Table - transposed table
14551456
"""
1457+
14561458
self = cls()
14571459
n_cols, self.n_rows = table.X.shape
14581460
old_domain = table.attributes.get("old_domain")
@@ -1522,10 +1524,22 @@ def get_table_from_attributes_of_attributes(_vars, _dtype=float):
15221524
names = chain.from_iterable(list(attr.attributes)
15231525
for attr in table.domain.attributes)
15241526
names = sorted(set(names) - {var.name for var in class_vars})
1527+
1528+
def guessed_var(i, var_name):
    # Guess the type of the i-th column of M, overwrite that column with the
    # converted values and return the newly created variable named var_name.
    io = Orange.data.io
    raw_column = M[:, i]
    guessed_map, guessed_values, guessed_type = io.guess_data_type(raw_column)
    converted, variable = io.sanitize_variable(
        guessed_map, guessed_values, raw_column, guessed_type,
        coltype_kwargs={}, domain_vars=_metas,
        existing_var=None, new_var_name=var_name)
    M[:, i] = converted
    return variable
1536+
15251537
_metas = [StringVariable(n) for n in names]
15261538
if old_domain:
15271539
_metas = [m for m in old_domain.metas if m.name != meta_attr_name]
15281540
M = get_table_from_attributes_of_attributes(_metas, _dtype=object)
1541+
if not old_domain:
1542+
_metas = [guessed_var(i, m.name) for i, m in enumerate(_metas)]
15291543
if _metas:
15301544
self.metas = np.hstack((self.metas, M))
15311545
metas.extend(_metas)

Orange/tests/test_table.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2342,7 +2342,7 @@ def test_transpose_continuous_metas(self):
23422342
metas = [ContinuousVariable("m1")]
23432343
domain = Domain(attrs, metas=metas)
23442344
X = np.arange(8).reshape((4, 2))
2345-
M = np.array([0, 1, 0, 1])[:, None]
2345+
M = np.array([0.0, 1.0, 0.0, 1.0])[:, None]
23462346
data = Table(domain, X, metas=M)
23472347

23482348
att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
@@ -2449,7 +2449,7 @@ def test_transpose_class_and_metas(self):
24492449
# original should not change
24502450
self.assertDictEqual(data.domain.attributes[0].attributes, {})
24512451

2452-
def test_transpose_attributes_of_attributes(self):
2452+
def test_transpose_attributes_of_attributes_discrete(self):
24532453
attrs = [ContinuousVariable("c1"), ContinuousVariable("c2")]
24542454
attrs[0].attributes = {"attr1": "a", "attr2": "aa"}
24552455
attrs[1].attributes = {"attr1": "b", "attr2": "bb"}
@@ -2458,11 +2458,12 @@ def test_transpose_attributes_of_attributes(self):
24582458

24592459
att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
24602460
ContinuousVariable("Feature 3"), ContinuousVariable("Feature 4")]
2461-
metas = [StringVariable("Feature name"), StringVariable("attr1"),
2462-
StringVariable("attr2")]
2461+
metas = [StringVariable("Feature name"),
2462+
DiscreteVariable("attr1", values=("a", "b")),
2463+
DiscreteVariable("attr2", values=("aa", "bb"))]
24632464
domain = Domain(att, metas=metas)
2464-
result = Table(domain, np.arange(8).reshape((4, 2)).T,
2465-
metas=np.array([["c1", "a", "aa"], ["c2", "b", "bb"]]))
2465+
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, 1.0]], dtype=object)
2466+
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)
24662467

24672468
# transpose and compare
24682469
self._compare_tables(result, Table.transpose(data))
@@ -2475,6 +2476,33 @@ def test_transpose_attributes_of_attributes(self):
24752476
self.assertDictEqual(data.domain.attributes[0].attributes,
24762477
{"attr1": "a", "attr2": "aa"})
24772478

2479+
def test_transpose_attributes_of_attributes_continuous(self):
    """Numeric attributes of attributes are expected as continuous metas."""
    c1 = ContinuousVariable("c1")
    c2 = ContinuousVariable("c2")
    c1.attributes = {"attr1": "1.100", "attr2": "1.300"}
    c2.attributes = {"attr1": "2.200", "attr2": "2.300"}
    data = Table(Domain([c1, c2]), np.arange(8).reshape((4, 2)))

    # Expected transposed table: one feature per original row, plus the
    # feature name and the two guessed continuous metas.
    feature_names = ("Feature 1", "Feature 2", "Feature 3", "Feature 4")
    expected_attrs = [ContinuousVariable(name) for name in feature_names]
    expected_metas = [StringVariable("Feature name"),
                      ContinuousVariable("attr1"),
                      ContinuousVariable("attr2")]
    expected_domain = Domain(expected_attrs, metas=expected_metas)
    expected_X = np.arange(8).reshape((4, 2)).T
    expected_M = np.array([["c1", 1.1, 1.3],
                           ["c2", 2.2, 2.3]], dtype=object)
    result = Table(expected_domain, expected_X, metas=expected_M)

    # transpose and compare
    transposed = Table.transpose(data)
    self._compare_tables(result, transposed)

    # transpose of transpose
    once = Table.transpose(data)
    round_trip = Table.transpose(once, "Feature name")
    self._compare_tables(data, round_trip)

    # original should not change
    untouched = data.domain.attributes[0].attributes
    self.assertDictEqual(untouched, {"attr1": "1.100", "attr2": "1.300"})
2505+
24782506
def test_transpose_attributes_of_attributes_missings(self):
24792507
attrs = [ContinuousVariable("c1"), ContinuousVariable("c2")]
24802508
attrs[0].attributes = {"attr1": "a", "attr2": "aa"}
@@ -2484,11 +2512,12 @@ def test_transpose_attributes_of_attributes_missings(self):
24842512

24852513
att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
24862514
ContinuousVariable("Feature 3"), ContinuousVariable("Feature 4")]
2487-
metas = [StringVariable("Feature name"), StringVariable("attr1"),
2488-
StringVariable("attr2")]
2515+
metas = [StringVariable("Feature name"),
2516+
DiscreteVariable("attr1", values=("a", "b")),
2517+
DiscreteVariable("attr2", values=("aa",))]
24892518
domain = Domain(att, metas=metas)
2490-
result = Table(domain, np.arange(8).reshape((4, 2)).T,
2491-
metas=np.array([["c1", "a", "aa"], ["c2", "b", ""]]))
2519+
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, np.nan]], dtype=object)
2520+
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)
24922521

24932522
# transpose and compare
24942523
self._compare_tables(result, Table.transpose(data))
@@ -2517,10 +2546,11 @@ def test_transpose_class_metas_attributes(self):
25172546
att[1].attributes = {"cls": "2.000", "m1": "bb", "m2": "bbb"}
25182547
att[2].attributes = {"cls": "3.000", "m1": "cc", "m2": "ccc"}
25192548
att[3].attributes = {"cls": "4.000", "m1": "dd", "m2": "ddd"}
2520-
metas = [StringVariable("Feature name"), StringVariable("attr1"),
2521-
StringVariable("attr2")]
2549+
metas = [StringVariable("Feature name"),
2550+
DiscreteVariable("attr1", values=("a1", "b1")),
2551+
DiscreteVariable("attr2", values=("aa1", "bb1"))]
25222552
domain = Domain(att, metas=metas)
2523-
M = np.array([["c1", "a1", "aa1"], ["c2", "b1", "bb1"]])
2553+
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, 1.0]], dtype=object)
25242554
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)
25252555

25262556
# transpose and compare
@@ -2546,7 +2576,8 @@ def _compare_tables(self, table1, table2):
25462576
self.assertEqual(table1.n_rows, table2.n_rows)
25472577
np.testing.assert_array_equal(table1.X, table2.X)
25482578
np.testing.assert_array_equal(table1.Y, table2.Y)
2549-
np.testing.assert_array_equal(table1.metas, table2.metas)
2579+
np.testing.assert_array_equal(table1.metas.astype(str),
2580+
table2.metas.astype(str))
25502581
np.testing.assert_array_equal(table1.W, table2.W)
25512582

25522583
self.assertEqual([(type(x), x.name, x.attributes)

0 commit comments

Comments
 (0)