Skip to content

Commit 0e4b7cf

Browse files
committed
io: Refactor to make guessing type heuristic reusable
1 parent 75125ad commit 0e4b7cf

File tree

1 file changed

+86
-59
lines changed

1 file changed

+86
-59
lines changed

Orange/data/io.py

Lines changed: 86 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,80 @@ def _from_file(f):
101101
return _from_file(filename)
102102

103103

104+
def guess_data_type(orig_values):
    """
    Guess the variable type of a column from its raw values.

    The candidates are tried in a fixed order and the first one that fits
    wins: discrete, continuous (every value converts with ``float``), time
    (every value is accepted by ``TimeVariable.parse``), and finally string.

    Parameters
    ----------
    orig_values
        The raw values of a single column.

    Returns
    -------
    valuemap : list
        Sorted discrete values when the column is judged discrete,
        otherwise an empty list.
    values
        The converted values for continuous and time columns; for discrete
        and string columns this is `orig_values` itself, unconverted.
    coltype
        One of DiscreteVariable, ContinuousVariable, TimeVariable or
        StringVariable.
    """
    # NOTE(review): is_discrete_values is defined elsewhere; it is used here
    # as "falsy when not discrete, otherwise a sortable collection of the
    # distinct values" -- confirm against its definition.
    discrete_values = is_discrete_values(orig_values)
    if discrete_values:
        return sorted(discrete_values), orig_values, DiscreteVariable

    try:
        as_floats = [float(item) for item in orig_values]
    except ValueError:
        pass
    else:
        return [], as_floats, ContinuousVariable

    # Not numeric: see whether every value parses as a time stamp.
    time_parser = TimeVariable('_')
    try:
        as_times = [time_parser.parse(item) for item in orig_values]
    except ValueError:
        # Nothing else matched, so keep the raw values as strings.
        return [], orig_values, StringVariable
    return [], as_times, TimeVariable
127+
128+
129+
def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      domain_vars, existing_var, new_var_name, data=None):
    """
    Convert a column's values for its variable type and create the variable.

    Parameters
    ----------
    valuemap : list
        Discrete values of the column. When non-empty, `orig_values` are
        mapped to the float index of each value in this list (``np.nan``
        for values that are not in it) and the list is stored in
        `coltype_kwargs` under the key ``values``.
    values
        The column values as converted so far. Returned as they are unless
        one of the conversion steps below replaces them.
    orig_values
        The raw, unconverted column values.
    coltype
        The variable class chosen for the column.
    coltype_kwargs : dict
        Keyword arguments passed on to the variable constructor.
        NOTE: this dict is updated in place when `valuemap` is non-empty.
    domain_vars
        Only compared against ``None`` here: when it is ``None`` no
        variable is created and the returned `var` is ``None``.
    existing_var : str or None
        Name of the column. When truthy, the variable is obtained with
        ``coltype.make`` using the stripped name.
    new_var_name : str or None
        Name for a freshly constructed variable; used only when
        `existing_var` is falsy.
    data : optional
        Array consulted when discrete values have to be reordered: if it
        has more than one dimension, `values` is reordered in place,
        otherwise `data` itself is. When omitted, `values` is reordered.

    Returns
    -------
    values
        The converted column values.
    var
        The variable for the column, or ``None`` if none was created.
    """
    if valuemap:
        # Map discrete data to ints
        def valuemap_index(val):
            try:
                return valuemap.index(val)
            except ValueError:
                return np.nan

        values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
        coltype_kwargs.update(values=valuemap)

    if coltype is StringVariable:
        values = ['' if i is np.nan else i for i in orig_values]

    var = None
    if domain_vars is not None:
        if existing_var:
            # Use existing variable if available
            var = coltype.make(existing_var.strip(), **coltype_kwargs)
        else:
            # Never use existing for un-named variables
            var = coltype(new_var_name, **coltype_kwargs)

        # Reorder discrete values to match existing variable
        if var.is_discrete and not var.ordered:
            new_order = var.values
            old_order = coltype_kwargs.get('values', var.values)
            if new_order != old_order:
                offset = len(new_order)
                # `data` is optional; without it the only column to fix up
                # is `values` (previously this raised AttributeError).
                column = values if data is None or data.ndim > 1 else data
                # Shift every index past the valid range first, so that an
                # already remapped value is never mistaken for an old one.
                column += offset
                for val in new_order:
                    try:
                        oldval = old_order.index(val)
                    except ValueError:
                        continue
                    bn.replace(column, offset + oldval, new_order.index(val))

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var
176+
177+
104178
class Flags:
105179
"""Parser for column flags (i.e. third header row)"""
106180
DELIMITER = ' '
@@ -522,6 +596,7 @@ def _equal_length(lst):
522596

523597
elif (type_flag in DiscreteVariable.TYPE_HEADERS or
524598
_RE_DISCRETE_LIST.match(type_flag)):
599+
coltype = DiscreteVariable
525600
if _RE_DISCRETE_LIST.match(type_flag):
526601
valuemap = Flags.split(type_flag)
527602
coltype_kwargs.update(ordered=True)
@@ -530,38 +605,7 @@ def _equal_length(lst):
530605

531606
else:
532607
# No known type specified, use heuristics
533-
is_discrete = is_discrete_values(orig_values)
534-
if is_discrete:
535-
valuemap = sorted(is_discrete)
536-
else:
537-
try:
538-
values = [float(i) for i in orig_values]
539-
except ValueError:
540-
tvar = TimeVariable('_')
541-
try:
542-
values = [tvar.parse(i) for i in orig_values]
543-
except ValueError:
544-
coltype = StringVariable
545-
else:
546-
coltype = TimeVariable
547-
else:
548-
coltype = ContinuousVariable
549-
550-
if valuemap:
551-
# Map discrete data to ints
552-
def valuemap_index(val):
553-
try:
554-
return valuemap.index(val)
555-
except ValueError:
556-
return np.nan
557-
558-
values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
559-
coltype = DiscreteVariable
560-
coltype_kwargs.update(values=valuemap)
561-
562-
if coltype is StringVariable:
563-
values = ['' if i is np.nan else i
564-
for i in orig_values]
608+
valuemap, values, coltype = guess_data_type(orig_values)
565609

566610
if flag.m or coltype is StringVariable:
567611
append_to = (Mcols, metas)
@@ -574,37 +618,20 @@ def valuemap_index(val):
574618

575619
cols, domain_vars = append_to
576620
cols.append(col)
577-
var = None
621+
622+
existing_var, new_var_name, column = None, None, None
623+
if domain_vars is not None:
624+
existing_var = names and names[col]
625+
if not existing_var:
626+
new_var_name = next(NAMEGEN)
627+
628+
values, var = sanitize_variable(
629+
valuemap, values, orig_values, coltype, coltype_kwargs,
630+
domain_vars, existing_var, new_var_name, data)
578631
if domain_vars is not None:
579-
if names and names[col]:
580-
# Use existing variable if available
581-
var = coltype.make(names[col].strip(), **coltype_kwargs)
582-
else:
583-
# Never use existing for un-named variables
584-
var = coltype(next(NAMEGEN), **coltype_kwargs)
585632
var.attributes.update(flag.attributes)
586633
domain_vars.append(var)
587634

588-
# Reorder discrete values to match existing variable
589-
if var.is_discrete and not var.ordered:
590-
new_order, old_order = var.values, coltype_kwargs.get('values', var.values)
591-
if new_order != old_order:
592-
offset = len(new_order)
593-
column = values if data.ndim > 1 else data
594-
column += offset
595-
for i, val in enumerate(var.values):
596-
try:
597-
oldval = old_order.index(val)
598-
except ValueError:
599-
continue
600-
bn.replace(column, offset + oldval, new_order.index(val))
601-
602-
if isinstance(var, TimeVariable) or coltype is TimeVariable:
603-
# Re-parse the values because only now after coltype.make call
604-
# above, variable var is the correct one
605-
_var = var if isinstance(var, TimeVariable) else TimeVariable('_')
606-
values = [_var.parse(i) for i in orig_values]
607-
608635
# Write back the changed data. This is needed to pass the
609636
# correct, converted values into Table.from_numpy below
610637
try:

0 commit comments

Comments
 (0)