Skip to content

Commit c30fa4c

Browse files
authored
Merge pull request #3180 from apetrov/ap-table_to_frame
[ENH] Add pandas_compat.table_to_frame(tab)
2 parents 000bcad + 789a0c9 commit c30fa4c

File tree

2 files changed

+87
-2
lines changed

2 files changed

+87
-2
lines changed

Orange/data/pandas_compat.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
ContinuousVariable,
1212
)
1313

14-
__all__ = ['table_from_frame']
14+
__all__ = ['table_from_frame', 'table_to_frame']
1515

1616

1717
def table_from_frame(df, *, force_nominal=False):
@@ -76,3 +76,52 @@ def _is_datetime(s):
7676
np.column_stack(X) if X else np.empty((df.shape[0], 0)),
7777
None,
7878
np.column_stack(M) if M else None)
79+
80+
81+
def table_to_frame(tab, include_metas=False):
    """
    Convert Orange.data.Table to pandas.DataFrame

    Columns appear in domain order: attributes first, then class variables
    and, only when requested, meta attributes.

    Parameters
    ----------
    tab : Table
    include_metas : bool, optional (default=False)
        If True, columns for the meta attributes are appended after the
        attributes and class variables. By default metas are left out.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If a variable is neither discrete, time, continuous nor string.
    """
    def _column_to_series(col, vals):
        # Each branch returns a (column name, column data) pair.
        if col.is_discrete:
            # Missing values are NaN in the table; pandas marks a missing
            # category with the code -1.
            codes = pd.Series(vals).fillna(-1).astype(int)
            return col.name, pd.Categorical.from_codes(
                codes=codes, categories=col.values, ordered=col.ordered)
        # Checked before is_continuous so that a variable reporting both
        # flags is converted to datetimes rather than plain numbers.
        if col.is_time:
            return col.name, pd.Series(pd.to_datetime(vals, unit='s'))
        if col.is_continuous:
            # np.nan is not compatible with an int column, so only
            # decimal-free columns without missing values become int.
            # pd.isnull also copes with object-dtype (meta) columns.
            as_int = (col.number_of_decimals == 0
                      and not pd.isnull(vals).any())
            return col.name, pd.Series(vals).astype(int if as_int else float)
        if col.is_string:
            return col.name, pd.Series(vals)
        raise ValueError(
            "Cannot convert variable '{}' of type {} to a pandas column"
            .format(col.name, type(col).__name__))

    def _columns_to_series(cols, vals):
        return [_column_to_series(col, vals[:, i])
                for i, col in enumerate(cols)]

    domain = tab.domain
    columns = []
    if domain.attributes:
        columns += _columns_to_series(domain.attributes, tab.X)
    if domain.class_vars:
        # Y is reshaped so that a single class variable is still 2-D.
        y_values = tab.Y.reshape(tab.Y.shape[0], len(domain.class_vars))
        columns += _columns_to_series(domain.class_vars, y_values)
    if include_metas and domain.metas:
        columns += _columns_to_series(domain.metas, tab.metas)

    # Building a DataFrame from a dict does not guarantee column order on
    # every pandas/Python version, so the domain order is restored explicitly.
    column_order = [name for name, _ in columns]
    return pd.DataFrame(dict(columns))[column_order]

Orange/data/tests/test_pandas.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unittest
22
import numpy as np
3-
from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable
3+
from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable, Table
44

55
try:
66
import pandas as pd
@@ -60,3 +60,39 @@ def test_table_from_frame(self):
6060
types = [type(var) for var in table.domain.attributes]
6161
self.assertEqual(names, ['index', '1', '2'])
6262
self.assertEqual(types, [DiscreteVariable, ContinuousVariable, TimeVariable])
63+
64+
def test_table_to_frame(self):
    """Convert the iris table and check column names, dtypes and values."""
    from Orange.data.pandas_compat import table_to_frame
    table = Table("iris")
    df = table_to_frame(table)
    table_column_names = [var.name for var in table.domain.variables]
    frame_column_names = list(df.columns)

    # table_to_frame restores the domain order of the columns, so the names
    # are compared in order rather than as sorted lists.
    self.assertEqual(table_column_names, frame_column_names)
    self.assertIsInstance(df['iris'].dtype, pd.api.types.CategoricalDtype)
    self.assertEqual(list(df['sepal length'])[0:4], [5.1, 4.9, 4.7, 4.6])
    self.assertEqual(list(df['iris'])[0:2], ['Iris-setosa', 'Iris-setosa'])
75+
76+
@unittest.skip("Convert all Orange demo dataset. It takes about 5s which is way to slow")
def test_table_to_frame_on_all_orange_dataset(self):
    """Smoke-test table_to_frame on every bundled ``.tab`` demo dataset.

    For each dataset the result must be a DataFrame with one row per table
    instance and one column per domain variable.
    """
    from os import listdir
    from Orange.data.pandas_compat import table_to_frame
    import pandas as pd

    dataset_directory = "Orange/datasets/"

    # The dataset name is the part of the file name before the first dot.
    dataset_names = [
        filename.split('.')[0]
        for filename in listdir(dataset_directory)
        if '.tab' in filename
    ]

    for dataset_name in dataset_names:
        source_table = Table(dataset_name)
        frame = table_to_frame(source_table)
        failure_note = "Failed to process Table('{}')".format(dataset_name)

        self.assertEqual(type(frame), pd.DataFrame, failure_note)
        self.assertEqual(len(frame), len(source_table), failure_note)
        self.assertEqual(len(frame.columns), len(source_table.domain), failure_note)

0 commit comments

Comments
 (0)