@@ -101,6 +101,80 @@ def _from_file(f):
101101 return _from_file (filename )
102102
103103
def guess_data_type(orig_values):
    """
    Guess a column's variable type from its raw values.

    The checks run in a fixed order and the first one that succeeds wins:
    discrete (as decided by ``is_discrete_values``), then continuous
    (every value converts with ``float``), then time (every value is
    accepted by ``TimeVariable.parse``), and finally string.

    Returns a ``(valuemap, values, coltype)`` tuple. ``valuemap`` is the
    sorted result of ``is_discrete_values`` for discrete columns and an
    empty list otherwise. ``values`` holds the converted values for
    continuous and time columns and is ``orig_values`` itself for discrete
    and string columns.
    """
    discrete_values = is_discrete_values(orig_values)
    if discrete_values:
        # Discrete columns keep their raw values; mapping them to indices
        # is left to the caller.
        return sorted(discrete_values), orig_values, DiscreteVariable

    try:
        return [], [float(item) for item in orig_values], ContinuousVariable
    except ValueError:
        # Not every value is numeric; fall through to the time check.
        pass

    time_parser = TimeVariable('_')
    try:
        return [], [time_parser.parse(item) for item in orig_values], TimeVariable
    except ValueError:
        # Neither numeric nor time-like: treat the column as plain strings.
        return [], orig_values, StringVariable
127+
128+
def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      domain_vars, existing_var, new_var_name, data=None):
    """
    Convert a column's values to suit ``coltype`` and build its variable.

    Parameters:
        valuemap: list of discrete values; when non-empty, ``orig_values``
            are mapped to their (float) index in this list, with unknown
            values becoming NaN.
        values: the column values as converted so far; returned unchanged
            when none of the conversions below apply.
        orig_values: the raw, unconverted column values.
        coltype: the variable class to create.
        coltype_kwargs: keyword arguments for the variable constructor.
            NOTE: this dict is updated in place with ``values=valuemap``
            when ``valuemap`` is non-empty.
        domain_vars: when ``None``, no variable is created.
        existing_var: name of an existing variable to reuse through
            ``coltype.make``; when falsy, a new variable called
            ``new_var_name`` is created instead.
        new_var_name: name for a newly created variable.
        data: optional array the column belongs to. When it is
            one-dimensional it is itself re-indexed in place during
            discrete value reordering; otherwise (or when omitted) the
            ``values`` column is re-indexed.

    Returns:
        A ``(values, var)`` tuple; ``var`` is ``None`` when ``domain_vars``
        is ``None``.
    """
    if valuemap:
        # Map discrete data to ints
        def valuemap_index(val):
            try:
                return valuemap.index(val)
            except ValueError:
                return np.nan

        values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
        coltype_kwargs.update(values=valuemap)

    if coltype is StringVariable:
        # Identity check: only the np.nan singleton itself is blanked out.
        values = ['' if i is np.nan else i for i in orig_values]

    var = None
    if domain_vars is not None:
        if existing_var:
            # Use existing variable if available
            var = coltype.make(existing_var.strip(), **coltype_kwargs)
        else:
            # Never use existing for un-named variables
            var = coltype(new_var_name, **coltype_kwargs)

        # Reorder discrete values to match existing variable
        if var.is_discrete and not var.ordered:
            new_order = var.values
            old_order = coltype_kwargs.get('values', var.values)
            if new_order != old_order:
                offset = len(new_order)
                # `data` is optional: without it (or when it is
                # multi-dimensional) the column to fix is `values`.
                # Previously a missing `data` raised AttributeError here.
                if data is None or data.ndim > 1:
                    column = values
                else:
                    column = data
                # Shift every index past the valid range first so that
                # not-yet-remapped old indices cannot collide with
                # already-written new ones.
                column += offset
                for new_index, val in enumerate(new_order):
                    try:
                        old_index = old_order.index(val)
                    except ValueError:
                        # Value absent from this column's own value list.
                        continue
                    # presumably bottleneck's in-place replace -- TODO confirm
                    bn.replace(column, offset + old_index, new_index)

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var
176+
177+
104178class Flags :
105179 """Parser for column flags (i.e. third header row)"""
106180 DELIMITER = ' '
@@ -522,6 +596,7 @@ def _equal_length(lst):
522596
523597 elif (type_flag in DiscreteVariable .TYPE_HEADERS or
524598 _RE_DISCRETE_LIST .match (type_flag )):
599+ coltype = DiscreteVariable
525600 if _RE_DISCRETE_LIST .match (type_flag ):
526601 valuemap = Flags .split (type_flag )
527602 coltype_kwargs .update (ordered = True )
@@ -530,38 +605,7 @@ def _equal_length(lst):
530605
531606 else :
532607 # No known type specified, use heuristics
533- is_discrete = is_discrete_values (orig_values )
534- if is_discrete :
535- valuemap = sorted (is_discrete )
536- else :
537- try :
538- values = [float (i ) for i in orig_values ]
539- except ValueError :
540- tvar = TimeVariable ('_' )
541- try :
542- values = [tvar .parse (i ) for i in orig_values ]
543- except ValueError :
544- coltype = StringVariable
545- else :
546- coltype = TimeVariable
547- else :
548- coltype = ContinuousVariable
549-
550- if valuemap :
551- # Map discrete data to ints
552- def valuemap_index (val ):
553- try :
554- return valuemap .index (val )
555- except ValueError :
556- return np .nan
557-
558- values = np .vectorize (valuemap_index , otypes = [float ])(orig_values )
559- coltype = DiscreteVariable
560- coltype_kwargs .update (values = valuemap )
561-
562- if coltype is StringVariable :
563- values = ['' if i is np .nan else i
564- for i in orig_values ]
608+ valuemap , values , coltype = guess_data_type (orig_values )
565609
566610 if flag .m or coltype is StringVariable :
567611 append_to = (Mcols , metas )
@@ -574,37 +618,20 @@ def valuemap_index(val):
574618
575619 cols , domain_vars = append_to
576620 cols .append (col )
577- var = None
621+
622+ existing_var , new_var_name , column = None , None , None
623+ if domain_vars is not None :
624+ existing_var = names and names [col ]
625+ if not existing_var :
626+ new_var_name = next (NAMEGEN )
627+
628+ values , var = sanitize_variable (
629+ valuemap , values , orig_values , coltype , coltype_kwargs ,
630+ domain_vars , existing_var , new_var_name , data )
578631 if domain_vars is not None :
579- if names and names [col ]:
580- # Use existing variable if available
581- var = coltype .make (names [col ].strip (), ** coltype_kwargs )
582- else :
583- # Never use existing for un-named variables
584- var = coltype (next (NAMEGEN ), ** coltype_kwargs )
585632 var .attributes .update (flag .attributes )
586633 domain_vars .append (var )
587634
588- # Reorder discrete values to match existing variable
589- if var .is_discrete and not var .ordered :
590- new_order , old_order = var .values , coltype_kwargs .get ('values' , var .values )
591- if new_order != old_order :
592- offset = len (new_order )
593- column = values if data .ndim > 1 else data
594- column += offset
595- for i , val in enumerate (var .values ):
596- try :
597- oldval = old_order .index (val )
598- except ValueError :
599- continue
600- bn .replace (column , offset + oldval , new_order .index (val ))
601-
602- if isinstance (var , TimeVariable ) or coltype is TimeVariable :
603- # Re-parse the values because only now after coltype.make call
604- # above, variable var is the correct one
605- _var = var if isinstance (var , TimeVariable ) else TimeVariable ('_' )
606- values = [_var .parse (i ) for i in orig_values ]
607-
608635 # Write back the changed data. This is needed to pass the
609636 # correct, converted values into Table.from_numpy below
610637 try :
0 commit comments