Skip to content

Commit 06709e9

Browse files
authored
Merge pull request #2836 from kernc/pandas_compat
ENH: Add pandas_compat.table_from_frame(df)
2 parents 4f52866 + e602be2 commit 06709e9

File tree

5 files changed

+143
-6
lines changed

5 files changed

+143
-6
lines changed

.travis/install_orange.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ foldable pip install -U setuptools pip codecov
77
cat requirements-core.txt \
88
requirements-gui.txt \
99
requirements-dev.txt \
10+
requirements-opt.txt \
1011
requirements-doc.txt |
1112
while read dep; do
1213
dep="${dep%%#*}" # Strip the comment

Orange/data/pandas_compat.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""Pandas DataFrame↔Table conversion helpers"""
2+
import numpy as np
3+
import pandas as pd
4+
from pandas.api.types import (
5+
is_categorical_dtype, is_object_dtype,
6+
is_datetime64_any_dtype, is_numeric_dtype,
7+
)
8+
9+
from Orange.data import (
10+
Table, Domain, DiscreteVariable, StringVariable, TimeVariable,
11+
ContinuousVariable,
12+
)
13+
14+
__all__ = ['table_from_frame']
15+
16+
17+
def table_from_frame(df, *, force_nominal=False):
    """
    Build an Orange.data.Table out of a pandas.DataFrame.

    Every column is mapped to one Orange variable, chosen in this order:

    1. categorical dtype, or object dtype that is either forced nominal or
       has fewer unique values than ``size ** .666`` -> DiscreteVariable;
    2. datetime64 dtype, or object dtype that ``pd.to_datetime`` accepts
       -> TimeVariable;
    3. any other numeric dtype -> ContinuousVariable;
    4. everything else -> StringVariable stored among the metas.

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _looks_nominal(column):
        # Categoricals are always nominal; object columns only when forced
        # or when they have few distinct values relative to their length.
        if is_categorical_dtype(column):
            return True
        if not is_object_dtype(column):
            return False
        return force_nominal or column.nunique() < column.size**.666

    def _looks_temporal(column):
        if is_datetime64_any_dtype(column):
            return True
        # Object columns count as datetimes only if the whole column parses;
        # any failure while probing simply means "not a datetime".
        try:
            parseable = is_object_dtype(column)
            if parseable:
                pd.to_datetime(column, infer_datetime_format=True)
        except Exception:  # pylint: disable=broad-except
            parseable = False
        return parseable

    # If df index is not a simple RangeIndex (or similar), put it into data
    index = df.index
    plain_index = index.is_integer() and (index.is_monotonic_increasing or
                                          index.is_monotonic_decreasing)
    if not plain_index:
        df = df.reset_index()

    attrs, metas = [], []
    x_columns, meta_columns = [], []

    for label, column in df.items():
        label = str(label)

        if _looks_nominal(column):
            categorical = column.astype('category').cat
            levels = categorical.categories.astype(str).tolist()
            attrs.append(DiscreteVariable(label, levels))
            # pandas marks missing values with code -1; turn them into NaN
            codes = categorical.codes.replace(-1, np.nan)
            x_columns.append(codes.values)
            continue

        if _looks_temporal(column):
            time_var = TimeVariable(label)
            attrs.append(time_var)
            parsed = pd.to_datetime(column, infer_datetime_format=True)
            # Go through the string form so TimeVariable.parse does the
            # conversion; 'NaT' strings are replaced by NaN beforehand.
            as_text = parsed.astype('str').replace('NaT', np.nan)
            x_columns.append(as_text.map(time_var.parse).values)
            continue

        if is_numeric_dtype(column):
            attrs.append(ContinuousVariable(label))
            x_columns.append(column.values)
            continue

        metas.append(StringVariable(label))
        meta_columns.append(column.values.astype(object))

    domain = Domain(attrs, None, metas)
    if x_columns:
        x_array = np.column_stack(x_columns)
    else:
        # No attribute columns: keep the row count with zero columns
        x_array = np.empty((df.shape[0], 0))
    meta_array = np.column_stack(meta_columns) if meta_columns else None

    return Table.from_numpy(domain, x_array, None, meta_array)

Orange/data/tests/test_pandas.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import unittest
2+
import numpy as np
3+
from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable
4+
5+
try:
6+
import pandas as pd
7+
except ImportError:
8+
pd = None
9+
10+
@unittest.skipIf(pd is None, "Missing package 'pandas'")
class TestPandasCompat(unittest.TestCase):
    """Tests for the pandas DataFrame -> Orange Table conversion."""

    def test_table_from_frame(self):
        """Check variable types, names and values produced by table_from_frame
        for the default call, with force_nominal=True, and with a
        non-integer index that has to be pulled into the data."""
        from Orange.data.pandas_compat import table_from_frame

        nan = np.nan
        recent = pd.Timestamp('2017-12-19')
        ancient = pd.Timestamp('1724-12-20')
        t_recent = recent.timestamp()
        t_ancient = ancient.timestamp()

        def describe(tbl):
            # Names and classes of the table's attribute variables
            attributes = tbl.domain.attributes
            return ([var.name for var in attributes],
                    [type(var) for var in attributes])

        df = pd.DataFrame([['a', 1, recent],
                           ['b', 0, ancient],
                           ['c', 0, ancient],
                           [nan, nan, nan]])

        # Default conversion: the string column ends up among the metas
        table = table_from_frame(df)
        np.testing.assert_equal(table.X,
                                [[1, t_recent],
                                 [0, t_ancient],
                                 [0, t_ancient],
                                 [nan, nan]])
        np.testing.assert_equal(table.metas.tolist(),
                                [['a'], ['b'], ['c'], [nan]])
        names, types = describe(table)
        self.assertEqual(names, ['1', '2'])
        self.assertEqual(types, [ContinuousVariable, TimeVariable])

        # Force strings nominal
        table = table_from_frame(df, force_nominal=True)
        np.testing.assert_equal(table.X,
                                [[0, 1, t_recent],
                                 [1, 0, t_ancient],
                                 [2, 0, t_ancient],
                                 [nan, nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), [[], [], [], []])
        names, types = describe(table)
        self.assertEqual(names, ['0', '1', '2'])
        self.assertEqual(types,
                         [DiscreteVariable, ContinuousVariable, TimeVariable])

        # Include index
        df.index = list('abaa')
        table = table_from_frame(df)
        np.testing.assert_equal(table.X,
                                [[0, 1, t_recent],
                                 [1, 0, t_ancient],
                                 [0, 0, t_ancient],
                                 [0, nan, nan]])
        np.testing.assert_equal(table.metas.tolist(),
                                [['a'], ['b'], ['c'], [nan]])
        names, types = describe(table)
        self.assertEqual(names, ['index', '1', '2'])
        self.assertEqual(types,
                         [DiscreteVariable, ContinuousVariable, TimeVariable])

appveyor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ environment:
2121
BUILD_GLOBAL_OPTIONS: build -j1
2222
BUILD_ENV: wheel==0.29.0 pip==9.0.1 numpy==1.9.3
2323
# SIP 4.19.4+ with PyQt5==5.9.1+ segfault our tests (GH-2756)
24-
TEST_ENV: sip==4.19.6 PyQt5==5.9.2 numpy==1.12.1 scipy==1.0.0b1 scikit-learn
24+
TEST_ENV: sip==4.19.6 PyQt5==5.9.2 numpy==1.12.1 scipy==1.0.0b1 scikit-learn pandas==0.21.1
2525

2626
matrix:
2727
- PYTHON: C:\Python34

requirements-opt.txt

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1 @@
1-
# This is required for, and only used by, Parallel Coordinates widget.
2-
# Once that is ported to whatever, this can be removed, along with
3-
# Orange/widgets/utils/plot/*
4-
# Optional because it's hard to install everywhere.
5-
qt-graph-helpers>=0.1.3
1+
pandas

0 commit comments

Comments
 (0)