Skip to content

Commit 7c447b1

Browse files
committed
pandas_compat: fix table_from_frames for "normal" dataframe
1 parent 0013e5a commit 7c447b1

File tree

2 files changed

+88
-110
lines changed

2 files changed

+88
-110
lines changed

Orange/data/pandas_compat.py

Lines changed: 74 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,26 @@ def __finalize__(self, other, method=None, **_):
146146
pd.DataFrame.__finalize__ = __finalize__
147147

148148

149+
def _reset_index(df: pd.DataFrame) -> pd.DataFrame:
150+
"""If df index is not a simple RangeIndex (or similar), include it into a table"""
151+
if (
152+
# not range-like index - test first to skip slow startswith(_o) check
153+
not (
154+
df.index.is_integer()
155+
and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
156+
)
157+
# check that it does not contain Orange index
158+
and (
159+
# startswith is slow (for long dfs) - firs check if col has strings
160+
isinstance(df.index, pd.MultiIndex)
161+
or not is_string_dtype(df.index)
162+
or not any(str(i).startswith("_o") for i in df.index)
163+
)
164+
):
165+
df = df.reset_index()
166+
return df
167+
168+
149169
def _is_discrete(s, force_nominal):
150170
return (is_categorical_dtype(s) or
151171
is_object_dtype(s) and (force_nominal or
@@ -207,136 +227,81 @@ def col_type(dt):
207227
).values
208228

209229

210-
def vars_from_df(df, role=None, force_nominal=False):
211-
if role is None and hasattr(df, 'orange_role'):
212-
_role = df.orange_role
213-
else:
214-
_role = role
230+
def to_categorical(s, _):
    """Encode the values of ``s`` as category codes, with NaN for missing.

    ``s`` is cast to pandas' ``category`` dtype and its integer codes are
    taken; the code ``-1`` (which pandas uses for missing values) is replaced
    by ``np.nan``.  The second positional argument is accepted and ignored.

    Returns a NumPy array of the resulting codes.
    """
    codes = s.astype("category").cat.codes
    # Equivalent to codes.replace(-1, np.nan), but much faster.
    keep = codes.ne(-1)
    return np.asarray(codes.where(keep, np.nan))
215235

216-
# If df index is not a simple RangeIndex (or similar), put it into data
217-
if (
218-
# not range-like index - test first to skip slow startswith(_o) check
219-
not (
220-
df.index.is_integer()
221-
and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
222-
)
223-
# check that it does not contain Orange index
224-
and (
225-
# startswith is slow (for long drs) - firs check if col has strings
226-
isinstance(df.index, pd.MultiIndex)
227-
or not is_string_dtype(df.index)
228-
or not any(str(i).startswith("_o") for i in df.index)
229-
)
230-
):
231-
df = df.reset_index()
232236

233-
Xcols, Ycols, Mcols = [], [], []
234-
Xexpr, Yexpr, Mexpr = [], [], []
235-
attrs, class_vars, metas = [], [], []
237+
def _as_object_array(s, _):
    """Return the values of ``s`` as a NumPy array of dtype ``object``."""
    return np.asarray(s, dtype=object)


def vars_from_df(df, role=None, force_nominal=False):
    """Build the data arrays and the ``Domain`` describing ``df``.

    Every column of ``df`` (after ``_reset_index``) is mapped to a variable:

    * a column listed in ``df.orange_variables`` reuses a copy of that
      variable (with ``compute_value=None``);
    * datetime columns become ``TimeVariable``;
    * discrete columns (per ``_is_discrete``) become ``DiscreteVariable``;
    * other numeric columns become ``ContinuousVariable`` (0 decimals for
      integer dtypes);
    * anything else becomes a ``StringVariable`` and is placed among metas.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert.
    role : Role, optional
        Role for all columns.  When ``None``, ``df.orange_role`` is used if
        the frame has that attribute; otherwise columns default to
        ``Role.Attribute``.
    force_nominal : bool
        Passed on to ``_is_discrete``.

    Returns
    -------
    tuple
        ``(xym, domain)`` where ``xym`` is a three-item list of arrays (or
        ``None``) in the order attributes, class variables, metas, and
        ``domain`` is ``Domain(attributes, class_vars, metas)``.

    Raises
    ------
    ValueError
        If a string column is found while ``role`` is set to something other
        than ``Role.Meta``.
    """
    if role is None and hasattr(df, 'orange_role'):
        role = df.orange_role
    df = _reset_index(df)

    # One list per role, in the order attributes, class variables, metas.
    # NOTE(review): indexing these tuples with a Role member assumes Role
    # values are the integers 0, 1, 2 in that order - confirm against Role.
    cols = [], [], []
    exprs = [], [], []
    vars_ = [], [], []

    # Loop invariants: neither depends on the column being processed.
    default_role = Role.Attribute if role is None else role
    has_orange_variables = hasattr(df, 'orange_variables')

    for column in df.columns:
        s = df[column]
        _role = default_role
        if has_orange_variables and column in df.orange_variables:
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            expr = None
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            expr = _convert_datetime
        elif _is_discrete(s, force_nominal):
            discrete = s.astype("category").cat
            var = DiscreteVariable(
                str(column), discrete.categories.astype(str).tolist()
            )
            expr = to_categorical
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int else keeps default behaviour
                str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
            )
            expr = None
        else:
            # Strings can only live in metas, whatever the default role is.
            if role is not None and role != Role.Meta:
                raise ValueError("String variable must be in metas.")
            _role = Role.Meta
            var = StringVariable(str(column))
            expr = _as_object_array

        cols[_role].append(column)
        exprs[_role].append(expr)
        vars_[_role].append(var)

    xym = []
    for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
        if not a_cols:
            # Compared by value, as before: this also matches an empty
            # class/meta list whenever the attribute list is empty too.
            arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
        elif not any(a_expr):
            # If this role covers every column, use the frame itself so the
            # table can share memory with the dataframe.  A set makes the
            # membership test linear instead of quadratic in column count.
            a_col_set = set(a_cols)
            a_df = df if all(c in a_col_set for c in df.columns) else df[a_cols]
            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
                arr = csr_matrix(a_df.sparse.to_coo())
            else:
                arr = np.asarray(a_df)
        else:
            # we'll have to copy the table to resolve any expressions
            arr = np.array(
                [
                    expr(df[col], var) if expr else np.asarray(df[col])
                    for var, col, expr in zip(a_vars, a_cols, a_expr)
                ]
            ).T
        xym.append(arr)

    # Let the tables share memory with pandas frame
    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
        xym[1] = xym[1][:, 0]

    return xym, Domain(*vars_)
340305

341306

342307
def table_from_frame(df, *, force_nominal=False):
@@ -396,13 +361,12 @@ def table_from_frames(xdf, ydf, mdf):
396361
W = None
397362
for df in dfs:
398363
if isinstance(df, OrangeDataFrame):
399-
W = [df.orange_weights[i] for i in df.index
400-
if i in df.orange_weights]
364+
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
401365
if len(W) != len(df.index):
402366
W = None
367+
attributes.update(df.orange_attributes)
403368
else:
404369
W = None
405-
attributes.update(df.orange_attributes)
406370

407371
return Table.from_numpy(
408372
domain,

Orange/data/tests/test_pandas.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,20 @@ def _get_orange_demo_datasets():
428428
self.assertEqual(len(df), len(table), assert_message)
429429
self.assertEqual(len(df.columns), len(table.domain.variables), assert_message)
430430

431+
def test_table_from_frames_not_orange_dataframe(self):
    """Plain pandas frames passed to ``Table.from_pandas_dfs`` keep data and names."""
    attr_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["x1", "x2", "x3"])
    class_df = pd.DataFrame([[5], [6]], columns=["y"])
    meta_df = pd.DataFrame([[1, 2], [4, 5]], columns=["m1", "m2"])

    table = Table.from_pandas_dfs(attr_df, class_df, meta_df)

    # Values: X, Y and metas must mirror the three input frames.
    np.testing.assert_array_equal(attr_df, table.X)
    np.testing.assert_array_equal(class_df.values.flatten(), table.Y)
    np.testing.assert_array_equal(meta_df, table.metas)

    # Names: each domain part must carry the column names of its frame.
    domain = table.domain
    attr_names = [var.name for var in domain.attributes]
    meta_names = [var.name for var in domain.metas]
    self.assertListEqual(attr_df.columns.tolist(), attr_names)
    self.assertEqual(class_df.columns[0], domain.class_var.name)
    self.assertListEqual(meta_df.columns.tolist(), meta_names)
444+
431445

432446
class TestTablePandas(unittest.TestCase):
433447
def setUp(self):

0 commit comments

Comments
 (0)