Skip to content

Commit e398098

Browse files
committed
io: Avoid posible worst case O(N**2) behaviour
... if number of distinct discrete values is ~= number of rows Fixes gh-1297
1 parent 0532b0c commit e398098

File tree

2 files changed

+44
-42
lines changed

2 files changed

+44
-42
lines changed

Orange/data/io.py

Lines changed: 33 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import warnings
99

1010
from ast import literal_eval
11-
from collections import OrderedDict, Counter
11+
from collections import OrderedDict, Counter, defaultdict
1212
from functools import lru_cache
1313
from importlib import import_module
1414
from itertools import chain, repeat
@@ -21,7 +21,6 @@
2121
from fnmatch import fnmatch
2222
from glob import glob
2323

24-
import bottleneck as bn
2524
import numpy as np
2625
from chardet.universaldetector import UniversalDetector
2726

@@ -114,7 +113,7 @@ def guess_data_type(orig_values):
114113
"""
115114
Use heuristics to guess data type.
116115
"""
117-
valuemap, values = [], orig_values
116+
valuemap, values = None, orig_values
118117
is_discrete = is_discrete_values(orig_values)
119118
if is_discrete:
120119
valuemap = sorted(is_discrete)
@@ -137,19 +136,7 @@ def guess_data_type(orig_values):
137136

138137
def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
139138
domain_vars, existing_var, new_var_name, data=None):
140-
def reorder_values():
141-
new_order, old_order = \
142-
var.values, coltype_kwargs.get('values', var.values)
143-
if new_order != old_order:
144-
offset = len(new_order)
145-
column = values if data.ndim > 1 else data
146-
column += offset
147-
for _, val in enumerate(var.values):
148-
try:
149-
oldval = old_order.index(val)
150-
except ValueError:
151-
continue
152-
bn.replace(column, offset + oldval, new_order.index(val))
139+
assert issubclass(coltype, Variable)
153140

154141
def get_number_of_decimals(values):
155142
len_ = len
@@ -158,32 +145,33 @@ def get_number_of_decimals(values):
158145
default=1)
159146
return ndecimals - 1
160147

161-
if valuemap:
162-
# Map discrete data to ints
163-
def valuemap_index(val):
164-
try:
165-
return valuemap.index(val)
166-
except ValueError:
167-
return np.nan
168-
169-
values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
148+
if issubclass(coltype, DiscreteVariable) and valuemap is not None:
170149
coltype_kwargs.update(values=valuemap)
171150

172-
if coltype is StringVariable:
173-
values = orig_values
151+
if existing_var:
152+
# Use existing variable if available
153+
var = coltype.make(existing_var.strip(), **coltype_kwargs)
154+
else:
155+
# Never use existing for un-named variables
156+
var = coltype(new_var_name, **coltype_kwargs)
157+
158+
if isinstance(var, DiscreteVariable):
159+
# Map discrete data to 'ints' (or at least what passes as int around
160+
# here)
161+
mapping = defaultdict(
162+
lambda: np.nan,
163+
{val: i for i, val in enumerate(var.values)},
164+
)
165+
mapping[""] = np.nan
166+
mapvalues_ = np.frompyfunc(mapping.__getitem__, 1, 1)
174167

175-
var = None
176-
if domain_vars is not None:
177-
if existing_var:
178-
# Use existing variable if available
179-
var = coltype.make(existing_var.strip(), **coltype_kwargs)
180-
else:
181-
# Never use existing for un-named variables
182-
var = coltype(new_var_name, **coltype_kwargs)
168+
def mapvalues(arr):
169+
arr = np.asarray(arr, dtype=object)
170+
return mapvalues_(arr, out=np.empty_like(arr, dtype=float))
171+
values = mapvalues(orig_values)
183172

184-
if var.is_discrete and not var.ordered:
185-
# Reorder discrete values to match existing variable
186-
reorder_values()
173+
if coltype is StringVariable:
174+
values = orig_values
187175

188176
# ContinuousVariable.number_of_decimals is supposed to be handled by
189177
# ContinuousVariable.to_val. In the interest of speed, the reader bypasses
@@ -643,7 +631,7 @@ def _equal_length(lst):
643631
orig_values[namask] = ""
644632

645633
coltype_kwargs = {}
646-
valuemap = []
634+
valuemap = None
647635
values = orig_values
648636

649637
if type_flag in StringVariable.TYPE_HEADERS:
@@ -699,9 +687,12 @@ def _equal_length(lst):
699687
if not existing_var:
700688
new_var_name = next(NAMEGEN)
701689

702-
values, var = sanitize_variable(
703-
valuemap, values, orig_values, coltype, coltype_kwargs,
704-
domain_vars, existing_var, new_var_name, data)
690+
if domain_vars is not None:
691+
values, var = sanitize_variable(
692+
valuemap, values, orig_values, coltype, coltype_kwargs,
693+
domain_vars, existing_var, new_var_name, data)
694+
else:
695+
var = None
705696
if domain_vars is not None:
706697
var.attributes.update(flag.attributes)
707698
domain_vars.append(var)

Orange/tests/test_tab_reader.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import unittest
77
import tempfile
88
import shutil
9+
import time
910
from collections import OrderedDict
1011

1112
import numpy as np
@@ -257,3 +258,13 @@ def test_number_of_decimals(self):
257258
self.assertEqual(data.domain["RI"].number_of_decimals, 5)
258259
self.assertEqual(data.domain["Na"].number_of_decimals, 2)
259260
self.assertEqual(data.domain["Fe"].number_of_decimals, 2)
261+
262+
def test_many_discrete(self):
263+
b = io.StringIO()
264+
b.write("Poser\nd\n\n")
265+
b.writelines("K" + str(i) + "\n" for i in range(30000))
266+
start = time.time()
267+
_ = TabReader(b).read()
268+
elapsed = time.time() - start
269+
if elapsed > 2:
270+
raise AssertionError()

0 commit comments

Comments
 (0)