@@ -146,6 +146,26 @@ def __finalize__(self, other, method=None, **_):
146146 pd .DataFrame .__finalize__ = __finalize__
147147
148148
def _reset_index(df: pd.DataFrame) -> pd.DataFrame:
    """If df index is not a simple RangeIndex (or similar), include it into a table.

    The index is left alone when it is "range-like" (integer dtype and
    monotonic in either direction) or when it is a flat string index in
    which at least one label starts with ``"_o"`` (an Orange index).
    In every other case ``df.reset_index()`` is returned, which turns the
    index into regular column(s).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index is inspected. It is never modified in place.

    Returns
    -------
    pd.DataFrame
        ``df`` itself when the index is kept, otherwise a new frame with
        the index moved into the columns.
    """
    index = df.index

    # Cheap test first, so the slow per-label startswith("_o") scan below is
    # skipped for ordinary integer indices.
    # `is_integer_dtype` replaces `Index.is_integer()`, which pandas
    # deprecated in 2.0. NOTE: an object-dtype index that merely holds
    # Python ints is not integer-dtype and is therefore reset.
    if is_integer_dtype(index) and (
        index.is_monotonic_increasing or index.is_monotonic_decreasing
    ):
        return df

    # startswith is slow (for long dfs) - first check that the index can
    # hold strings at all; a MultiIndex is never treated as an Orange index.
    has_orange_index = (
        not isinstance(index, pd.MultiIndex)
        and is_string_dtype(index)
        and any(str(label).startswith("_o") for label in index)
    )
    if has_orange_index:
        return df
    return df.reset_index()
167+
168+
149169def _is_discrete (s , force_nominal ):
150170 return (is_categorical_dtype (s ) or
151171 is_object_dtype (s ) and (force_nominal or
@@ -207,136 +227,81 @@ def col_type(dt):
207227 ).values
208228
209229
210- def vars_from_df ( df , role = None , force_nominal = False ):
211- if role is None and hasattr ( df , 'orange_role' ):
212- _role = df . orange_role
213- else :
214- _role = role
def to_categorical(s, _):
    """Return the category codes of ``s`` as a numpy array.

    ``s`` is cast to a pandas categorical and its integer codes are taken;
    every code equal to -1 is replaced by NaN. The second positional
    argument is accepted but not used.
    """
    codes = s.astype("category").cat.codes
    # Equivalent to codes.replace(-1, np.nan), but much faster.
    is_missing = codes == -1
    return np.asarray(codes.where(~is_missing, np.nan))
215235
216- # If df index is not a simple RangeIndex (or similar), put it into data
217- if (
218- # not range-like index - test first to skip slow startswith(_o) check
219- not (
220- df .index .is_integer ()
221- and (df .index .is_monotonic_increasing or df .index .is_monotonic_decreasing )
222- )
223- # check that it does not contain Orange index
224- and (
225- # startswith is slow (for long drs) - firs check if col has strings
226- isinstance (df .index , pd .MultiIndex )
227- or not is_string_dtype (df .index )
228- or not any (str (i ).startswith ("_o" ) for i in df .index )
229- )
230- ):
231- df = df .reset_index ()
232236
233- Xcols , Ycols , Mcols = [], [], []
234- Xexpr , Yexpr , Mexpr = [], [], []
235- attrs , class_vars , metas = [], [], []
def _as_object_array(s, _):
    """Return the values of column ``s`` as a numpy array of dtype object."""
    return np.asarray(s, dtype=object)


def vars_from_df(df, role=None, force_nominal=False):
    """Build Orange variables and the matching data arrays from a data frame.

    Each column is mapped to a variable and stored in one of three slots,
    in the order they are passed to ``Domain``. The slot is selected by
    indexing with the column's role.
    NOTE(review): this assumes ``Role.Attribute``, ``Role.ClassAttribute``
    and ``Role.Meta`` index slots 0, 1 and 2 - confirm against ``Role``.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame. If it has an ``orange_role`` attribute and ``role`` is
        None, that role is used. If it has ``orange_variables``, columns
        found there reuse a copy of the stored variable.
    role : optional
        Role applied to every column. When None (and ``df`` has no
        ``orange_role``), columns default to ``Role.Attribute`` and string
        columns go to ``Role.Meta``.
    force_nominal : bool
        Forwarded to ``_is_discrete`` when deciding whether a column is
        discrete.

    Returns
    -------
    tuple
        ``(xym, domain)``. ``xym`` is a three-item list with one array per
        slot. An empty first slot is an array of shape ``(len(df), 0)``;
        any other empty slot is None. A two-dimensional second slot with
        exactly one column is flattened to one dimension.

    Raises
    ------
    ValueError
        If a string column is found while the effective role is set and is
        not ``Role.Meta``.
    """
    if role is None and hasattr(df, 'orange_role'):
        role = df.orange_role
    df = _reset_index(df)

    cols = [], [], []
    exprs = [], [], []
    vars_ = [], [], []

    # Loop-invariant: look the stored variables up once, not once per column.
    orange_vars = df.orange_variables if hasattr(df, 'orange_variables') else {}

    for column in df.columns:
        s = df[column]
        _role = Role.Attribute if role is None else role
        if column in orange_vars:
            var = orange_vars[column].copy(compute_value=None)
            expr = None
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            expr = _convert_datetime
        elif _is_discrete(s, force_nominal):
            discrete = s.astype("category").cat
            var = DiscreteVariable(
                str(column), discrete.categories.astype(str).tolist()
            )
            expr = to_categorical
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int else keeps default behaviour
                str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
            )
            expr = None
        else:
            # Anything left is treated as a string column; those may only be metas.
            if role is not None and role != Role.Meta:
                raise ValueError("String variable must be in metas.")
            _role = Role.Meta
            var = StringVariable(str(column))
            expr = _as_object_array

        cols[_role].append(column)
        exprs[_role].append(expr)
        vars_[_role].append(var)

    xym = []
    for slot, (a_vars, a_cols, a_expr) in enumerate(zip(vars_, cols, exprs)):
        if not a_cols:
            # Only the first slot (X) gets an empty 2D array; other empty
            # slots are None. Decided by position: comparing the lists
            # would treat every empty slot as X when X itself is empty.
            arr = np.empty((df.shape[0], 0)) if slot == 0 else None
        elif not any(a_expr):
            # No conversion needed. If the slot covers every column, the
            # frame is used as is so the table can share memory with it.
            a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
                arr = csr_matrix(a_df.sparse.to_coo())
            else:
                arr = np.asarray(a_df)
        else:
            # we'll have to copy the table to resolve any expressions
            arr = np.array(
                [
                    expr(df[col], var) if expr else np.asarray(df[col])
                    for var, col, expr in zip(a_vars, a_cols, a_expr)
                ]
            ).T
        xym.append(arr)

    # Let the tables share memory with pandas frame
    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
        xym[1] = xym[1][:, 0]

    return xym, Domain(*vars_)
340305
341306
342307def table_from_frame (df , * , force_nominal = False ):
@@ -396,13 +361,12 @@ def table_from_frames(xdf, ydf, mdf):
396361 W = None
397362 for df in dfs :
398363 if isinstance (df , OrangeDataFrame ):
399- W = [df .orange_weights [i ] for i in df .index
400- if i in df .orange_weights ]
364+ W = [df .orange_weights [i ] for i in df .index if i in df .orange_weights ]
401365 if len (W ) != len (df .index ):
402366 W = None
367+ attributes .update (df .orange_attributes )
403368 else :
404369 W = None
405- attributes .update (df .orange_attributes )
406370
407371 return Table .from_numpy (
408372 domain ,
0 commit comments