Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 86 additions & 59 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,80 @@ def _from_file(f):
return _from_file(filename)


def guess_data_type(orig_values):
    """
    Use heuristics to guess the data type of a column of raw values.

    The candidates are tried in a fixed order: discrete, continuous
    (every value converts with ``float``), time (every value is accepted
    by ``TimeVariable.parse``), and finally string.

    Returns a ``(valuemap, values, coltype)`` tuple:

    * ``valuemap`` -- the sorted result of ``is_discrete_values`` for a
      discrete column, otherwise an empty list;
    * ``values`` -- the converted values for continuous and time columns,
      otherwise ``orig_values`` unchanged;
    * ``coltype`` -- the variable class chosen for the column.
    """
    # presumably is_discrete_values returns the distinct values when the
    # column looks categorical and something falsy otherwise -- confirm
    # against its definition.
    distinct = is_discrete_values(orig_values)
    if distinct:
        return sorted(distinct), orig_values, DiscreteVariable

    try:
        return [], [float(v) for v in orig_values], ContinuousVariable
    except ValueError:
        pass

    # Not purely numeric: fall back to time parsing, then to plain strings.
    parser = TimeVariable('_')
    try:
        return [], [parser.parse(v) for v in orig_values], TimeVariable
    except ValueError:
        return [], orig_values, StringVariable


def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      domain_vars, existing_var, new_var_name, data=None):
    """
    Convert a column's values to suit `coltype` and create its variable.

    Parameters:
        valuemap: list of discrete values; when non-empty, `orig_values`
            are mapped to their float index in this list (NaN if absent).
        values: values converted so far; replaced when a remapping or
            re-parsing applies.
        orig_values: the raw, unconverted column values.
        coltype: the variable class for the column.
        coltype_kwargs: keyword arguments for the variable constructor.
            NOTE: updated in place with ``values=valuemap`` when
            `valuemap` is non-empty.
        domain_vars: when None, no variable is created.
        existing_var: name under which ``coltype.make`` is asked for the
            variable; if falsy, a fresh variable named `new_var_name`
            is constructed instead.
        new_var_name: name for the fresh variable.
        data: optional array. When given with ``ndim <= 1`` it, rather
            than `values`, is the column that gets re-coded in place.

    Returns a ``(values, var)`` tuple; ``var`` is None when `domain_vars`
    is None.

    NOTE(review): block nesting was reconstructed from a listing without
    indentation -- confirm that the time re-parse belongs at function
    level (it is the only placement where the ``TimeVariable('_')``
    fallback is reachable).
    """
    if valuemap:
        # Map discrete data to ints
        def valuemap_index(val):
            try:
                return valuemap.index(val)
            except ValueError:
                return np.nan

        values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
        coltype_kwargs.update(values=valuemap)

    if coltype is StringVariable:
        values = ['' if i is np.nan else i for i in orig_values]

    var = None
    if domain_vars is not None:
        if existing_var:
            # Use existing variable if available
            var = coltype.make(existing_var.strip(), **coltype_kwargs)
        else:
            # Never use existing for un-named variables
            var = coltype(new_var_name, **coltype_kwargs)

        # Reorder discrete values to match existing variable
        if var.is_discrete and not var.ordered:
            new_order = var.values
            old_order = coltype_kwargs.get('values', var.values)
            if new_order != old_order:
                offset = len(new_order)
                # `data` is optional: without it the only column available
                # for re-coding is `values` (previously this dereferenced
                # None and raised AttributeError).
                column = values if data is None or data.ndim > 1 else data
                # Shift every code past the range of valid new indices so
                # that an already re-coded entry can never be mistaken for
                # an old code that is still waiting to be replaced.
                column += offset
                for val in new_order:
                    try:
                        oldval = old_order.index(val)
                    except ValueError:
                        continue
                    bn.replace(column, offset + oldval, new_order.index(val))

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var


class Flags:
"""Parser for column flags (i.e. third header row)"""
DELIMITER = ' '
Expand Down Expand Up @@ -522,6 +596,7 @@ def _equal_length(lst):

elif (type_flag in DiscreteVariable.TYPE_HEADERS or
_RE_DISCRETE_LIST.match(type_flag)):
coltype = DiscreteVariable
if _RE_DISCRETE_LIST.match(type_flag):
valuemap = Flags.split(type_flag)
coltype_kwargs.update(ordered=True)
Expand All @@ -530,38 +605,7 @@ def _equal_length(lst):

else:
# No known type specified, use heuristics
is_discrete = is_discrete_values(orig_values)
if is_discrete:
valuemap = sorted(is_discrete)
else:
try:
values = [float(i) for i in orig_values]
except ValueError:
tvar = TimeVariable('_')
try:
values = [tvar.parse(i) for i in orig_values]
except ValueError:
coltype = StringVariable
else:
coltype = TimeVariable
else:
coltype = ContinuousVariable

if valuemap:
# Map discrete data to ints
def valuemap_index(val):
try:
return valuemap.index(val)
except ValueError:
return np.nan

values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
coltype = DiscreteVariable
coltype_kwargs.update(values=valuemap)

if coltype is StringVariable:
values = ['' if i is np.nan else i
for i in orig_values]
valuemap, values, coltype = guess_data_type(orig_values)

if flag.m or coltype is StringVariable:
append_to = (Mcols, metas)
Expand All @@ -574,37 +618,20 @@ def valuemap_index(val):

cols, domain_vars = append_to
cols.append(col)
var = None

existing_var, new_var_name, column = None, None, None
if domain_vars is not None:
existing_var = names and names[col]
if not existing_var:
new_var_name = next(NAMEGEN)

values, var = sanitize_variable(
valuemap, values, orig_values, coltype, coltype_kwargs,
domain_vars, existing_var, new_var_name, data)
if domain_vars is not None:
if names and names[col]:
# Use existing variable if available
var = coltype.make(names[col].strip(), **coltype_kwargs)
else:
# Never use existing for un-named variables
var = coltype(next(NAMEGEN), **coltype_kwargs)
var.attributes.update(flag.attributes)
domain_vars.append(var)

# Reorder discrete values to match existing variable
if var.is_discrete and not var.ordered:
new_order, old_order = var.values, coltype_kwargs.get('values', var.values)
if new_order != old_order:
offset = len(new_order)
column = values if data.ndim > 1 else data
column += offset
for i, val in enumerate(var.values):
try:
oldval = old_order.index(val)
except ValueError:
continue
bn.replace(column, offset + oldval, new_order.index(val))

if isinstance(var, TimeVariable) or coltype is TimeVariable:
# Re-parse the values because only now after coltype.make call
# above, variable var is the correct one
_var = var if isinstance(var, TimeVariable) else TimeVariable('_')
values = [_var.parse(i) for i in orig_values]

# Write back the changed data. This is needed to pass the
# correct, converted values into Table.from_numpy below
try:
Expand Down
14 changes: 14 additions & 0 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import bottleneck as bn
from scipy import sparse as sp

import Orange.data # import for io.py
from Orange.data import (
_contingency, _valuecount,
Domain, Variable, Storage, StringVariable, Unknown, Value, Instance,
Expand Down Expand Up @@ -1453,6 +1454,7 @@ def transpose(cls, table, feature_names_column="",
feature names are mapped
:return: Table - transposed table
"""

self = cls()
n_cols, self.n_rows = table.X.shape
old_domain = table.attributes.get("old_domain")
Expand Down Expand Up @@ -1521,10 +1523,22 @@ def get_table_from_attributes_of_attributes(_vars, _dtype=float):
names = chain.from_iterable(list(attr.attributes)
for attr in table.domain.attributes)
names = sorted(set(names) - {var.name for var in class_vars})

def guessed_var(i, var_name):
orig_vals = M[:, i]
val_map, vals, var_type = Orange.data.io.guess_data_type(orig_vals)
values, variable = Orange.data.io.sanitize_variable(
val_map, vals, orig_vals, var_type,
{}, _metas, None, var_name)
M[:, i] = values
return variable

_metas = [StringVariable(n) for n in names]
if old_domain:
_metas = [m for m in old_domain.metas if m.name != meta_attr_name]
M = get_table_from_attributes_of_attributes(_metas, _dtype=object)
if not old_domain:
_metas = [guessed_var(i, m.name) for i, m in enumerate(_metas)]
if _metas:
self.metas = np.hstack((self.metas, M))
metas.extend(_metas)
Expand Down
59 changes: 45 additions & 14 deletions Orange/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2342,7 +2342,7 @@ def test_transpose_continuous_metas(self):
metas = [ContinuousVariable("m1")]
domain = Domain(attrs, metas=metas)
X = np.arange(8).reshape((4, 2))
M = np.array([0, 1, 0, 1])[:, None]
M = np.array([0.0, 1.0, 0.0, 1.0])[:, None]
data = Table(domain, X, metas=M)

att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
Expand Down Expand Up @@ -2449,7 +2449,7 @@ def test_transpose_class_and_metas(self):
# original should not change
self.assertDictEqual(data.domain.attributes[0].attributes, {})

def test_transpose_attributes_of_attributes(self):
def test_transpose_attributes_of_attributes_discrete(self):
attrs = [ContinuousVariable("c1"), ContinuousVariable("c2")]
attrs[0].attributes = {"attr1": "a", "attr2": "aa"}
attrs[1].attributes = {"attr1": "b", "attr2": "bb"}
Expand All @@ -2458,11 +2458,12 @@ def test_transpose_attributes_of_attributes(self):

att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
ContinuousVariable("Feature 3"), ContinuousVariable("Feature 4")]
metas = [StringVariable("Feature name"), StringVariable("attr1"),
StringVariable("attr2")]
metas = [StringVariable("Feature name"),
DiscreteVariable("attr1", values=("a", "b")),
DiscreteVariable("attr2", values=("aa", "bb"))]
domain = Domain(att, metas=metas)
result = Table(domain, np.arange(8).reshape((4, 2)).T,
metas=np.array([["c1", "a", "aa"], ["c2", "b", "bb"]]))
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, 1.0]], dtype=object)
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)

# transpose and compare
self._compare_tables(result, Table.transpose(data))
Expand All @@ -2475,6 +2476,33 @@ def test_transpose_attributes_of_attributes(self):
self.assertDictEqual(data.domain.attributes[0].attributes,
{"attr1": "a", "attr2": "aa"})

def test_transpose_attributes_of_attributes_continuous(self):
    # Attribute values that all look like floats are expected to come back
    # as ContinuousVariable metas after transposing.
    source_vars = [ContinuousVariable("c1"), ContinuousVariable("c2")]
    source_vars[0].attributes = {"attr1": "1.100", "attr2": "1.300"}
    source_vars[1].attributes = {"attr1": "2.200", "attr2": "2.300"}
    data = Table(Domain(source_vars), np.arange(8).reshape((4, 2)))

    features = [ContinuousVariable("Feature 1"),
                ContinuousVariable("Feature 2"),
                ContinuousVariable("Feature 3"),
                ContinuousVariable("Feature 4")]
    meta_vars = [StringVariable("Feature name"),
                 ContinuousVariable("attr1"),
                 ContinuousVariable("attr2")]
    expected_domain = Domain(features, metas=meta_vars)
    expected_metas = np.array([["c1", 1.1, 1.3],
                               ["c2", 2.2, 2.3]], dtype=object)
    expected = Table(expected_domain, np.arange(8).reshape((4, 2)).T,
                     metas=expected_metas)

    # transpose and compare
    self._compare_tables(expected, Table.transpose(data))

    # transpose of transpose
    round_trip = Table.transpose(Table.transpose(data), "Feature name")
    self._compare_tables(data, round_trip)

    # original should not change
    self.assertDictEqual(data.domain.attributes[0].attributes,
                         {"attr1": "1.100", "attr2": "1.300"})

def test_transpose_attributes_of_attributes_missings(self):
attrs = [ContinuousVariable("c1"), ContinuousVariable("c2")]
attrs[0].attributes = {"attr1": "a", "attr2": "aa"}
Expand All @@ -2484,11 +2512,12 @@ def test_transpose_attributes_of_attributes_missings(self):

att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
ContinuousVariable("Feature 3"), ContinuousVariable("Feature 4")]
metas = [StringVariable("Feature name"), StringVariable("attr1"),
StringVariable("attr2")]
metas = [StringVariable("Feature name"),
DiscreteVariable("attr1", values=("a", "b")),
DiscreteVariable("attr2", values=("aa",))]
domain = Domain(att, metas=metas)
result = Table(domain, np.arange(8).reshape((4, 2)).T,
metas=np.array([["c1", "a", "aa"], ["c2", "b", ""]]))
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, np.nan]], dtype=object)
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)

# transpose and compare
self._compare_tables(result, Table.transpose(data))
Expand Down Expand Up @@ -2517,10 +2546,11 @@ def test_transpose_class_metas_attributes(self):
att[1].attributes = {"cls": "2.000", "m1": "bb", "m2": "bbb"}
att[2].attributes = {"cls": "3.000", "m1": "cc", "m2": "ccc"}
att[3].attributes = {"cls": "4.000", "m1": "dd", "m2": "ddd"}
metas = [StringVariable("Feature name"), StringVariable("attr1"),
StringVariable("attr2")]
metas = [StringVariable("Feature name"),
DiscreteVariable("attr1", values=("a1", "b1")),
DiscreteVariable("attr2", values=("aa1", "bb1"))]
domain = Domain(att, metas=metas)
M = np.array([["c1", "a1", "aa1"], ["c2", "b1", "bb1"]])
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, 1.0]], dtype=object)
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)

# transpose and compare
Expand All @@ -2546,7 +2576,8 @@ def _compare_tables(self, table1, table2):
self.assertEqual(table1.n_rows, table2.n_rows)
np.testing.assert_array_equal(table1.X, table2.X)
np.testing.assert_array_equal(table1.Y, table2.Y)
np.testing.assert_array_equal(table1.metas, table2.metas)
np.testing.assert_array_equal(table1.metas.astype(str),
table2.metas.astype(str))
np.testing.assert_array_equal(table1.W, table2.W)

self.assertEqual([(type(x), x.name, x.attributes)
Expand Down