88import warnings
99
1010from ast import literal_eval
11- from collections import OrderedDict , Counter
11+ from collections import OrderedDict , Counter , defaultdict
1212from functools import lru_cache
1313from importlib import import_module
1414from itertools import chain , repeat
2121from fnmatch import fnmatch
2222from glob import glob
2323
24- import bottleneck as bn
2524import numpy as np
2625from chardet .universaldetector import UniversalDetector
2726
@@ -114,7 +113,7 @@ def guess_data_type(orig_values):
114113 """
115114 Use heuristics to guess data type.
116115 """
117- valuemap , values = [] , orig_values
116+ valuemap , values = None , orig_values
118117 is_discrete = is_discrete_values (orig_values )
119118 if is_discrete :
120119 valuemap = sorted (is_discrete )
@@ -137,19 +136,7 @@ def guess_data_type(orig_values):
137136
138137def sanitize_variable (valuemap , values , orig_values , coltype , coltype_kwargs ,
139138 domain_vars , existing_var , new_var_name , data = None ):
140- def reorder_values ():
141- new_order , old_order = \
142- var .values , coltype_kwargs .get ('values' , var .values )
143- if new_order != old_order :
144- offset = len (new_order )
145- column = values if data .ndim > 1 else data
146- column += offset
147- for _ , val in enumerate (var .values ):
148- try :
149- oldval = old_order .index (val )
150- except ValueError :
151- continue
152- bn .replace (column , offset + oldval , new_order .index (val ))
139+ assert issubclass (coltype , Variable )
153140
154141 def get_number_of_decimals (values ):
155142 len_ = len
@@ -158,32 +145,33 @@ def get_number_of_decimals(values):
158145 default = 1 )
159146 return ndecimals - 1
160147
161- if valuemap :
162- # Map discrete data to ints
163- def valuemap_index (val ):
164- try :
165- return valuemap .index (val )
166- except ValueError :
167- return np .nan
168-
169- values = np .vectorize (valuemap_index , otypes = [float ])(orig_values )
148+ if issubclass (coltype , DiscreteVariable ) and valuemap is not None :
170149 coltype_kwargs .update (values = valuemap )
171150
172- if coltype is StringVariable :
173- values = orig_values
151+ if existing_var :
152+ # Use existing variable if available
153+ var = coltype .make (existing_var .strip (), ** coltype_kwargs )
154+ else :
155+ # Never use existing for un-named variables
156+ var = coltype (new_var_name , ** coltype_kwargs )
157+
158+ if isinstance (var , DiscreteVariable ):
159+ # Map discrete data to 'ints' (or at least what passes as int around
160+ # here)
161+ mapping = defaultdict (
162+ lambda : np .nan ,
163+ {val : i for i , val in enumerate (var .values )},
164+ )
165+ mapping ["" ] = np .nan
166+ mapvalues_ = np .frompyfunc (mapping .__getitem__ , 1 , 1 )
174167
175- var = None
176- if domain_vars is not None :
177- if existing_var :
178- # Use existing variable if available
179- var = coltype .make (existing_var .strip (), ** coltype_kwargs )
180- else :
181- # Never use existing for un-named variables
182- var = coltype (new_var_name , ** coltype_kwargs )
168+ def mapvalues (arr ):
169+ arr = np .asarray (arr , dtype = object )
170+ return mapvalues_ (arr , out = np .empty_like (arr , dtype = float ))
171+ values = mapvalues (orig_values )
183172
184- if var .is_discrete and not var .ordered :
185- # Reorder discrete values to match existing variable
186- reorder_values ()
173+ if coltype is StringVariable :
174+ values = orig_values
187175
188176 # ContinuousVariable.number_of_decimals is supposed to be handled by
189177 # ContinuousVariable.to_val. In the interest of speed, the reader bypasses
@@ -643,7 +631,7 @@ def _equal_length(lst):
643631 orig_values [namask ] = ""
644632
645633 coltype_kwargs = {}
646- valuemap = []
634+ valuemap = None
647635 values = orig_values
648636
649637 if type_flag in StringVariable .TYPE_HEADERS :
@@ -699,9 +687,12 @@ def _equal_length(lst):
699687 if not existing_var :
700688 new_var_name = next (NAMEGEN )
701689
702- values , var = sanitize_variable (
703- valuemap , values , orig_values , coltype , coltype_kwargs ,
704- domain_vars , existing_var , new_var_name , data )
690+ if domain_vars is not None :
691+ values , var = sanitize_variable (
692+ valuemap , values , orig_values , coltype , coltype_kwargs ,
693+ domain_vars , existing_var , new_var_name , data )
694+ else :
695+ var = None
705696 if domain_vars is not None :
706697 var .attributes .update (flag .attributes )
707698 domain_vars .append (var )
0 commit comments