Skip to content

Commit 980210d

Browse files
committed
utils: state_summary.py
1 parent 31cc232 commit 980210d

File tree

2 files changed

+258
-0
lines changed

2 files changed

+258
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from Orange.data import StringVariable, DiscreteVariable, ContinuousVariable, \
2+
TimeVariable
3+
4+
5+
def format_variables_string(variables):
    """
    Format the descriptive part of the input/output summary for one group of
    variables (features, targets or metas) of a dataset.

    The result takes one of these shapes:

    - ``'—'`` when no variable of a recognised type is present,
    - ``'<label>'`` for a single variable, e.g. ``'categorical'``,
    - ``'<n> <label>'`` for several variables of one type, e.g. ``'4 numeric'``,
    - ``'<total> (<n1> <label1>, <n2> <label2>, ...)'`` for mixed types.

    String variables additionally carry the suffix ``' (not shown)'``.

    :param variables: Features, targets or metas of the input dataset; any
        iterable of variables (it is traversed only once)
    :return: A formatted string
    """
    # (label, exact variable class, suffix appended to the label), listed in
    # the order in which the groups appear in the summary.
    labelled_types = (
        ('categorical', DiscreteVariable, ''),
        ('numeric', ContinuousVariable, ''),
        ('time', TimeVariable, ''),
        ('string', StringVariable, ' (not shown)'),
    )

    # A `TimeVariable` is also a `ContinuousVariable` but must be reported as
    # 'time', so variables are matched on their exact class rather than with
    # `isinstance`. Collecting the classes up front means `variables` is
    # consumed exactly once, which also makes generators safe to pass in.
    exact_types = [type(var) for var in variables]

    groups = []
    for label, var_type, suffix in labelled_types:
        count = sum(1 for found in exact_types if found is var_type)
        if count:
            groups.append((count, label + suffix))

    if not groups:
        return '—'

    if len(groups) > 1:
        total = sum(count for count, _ in groups)
        breakdown = ', '.join(f'{count} {text}' for count, text in groups)
        return f'{total} ({breakdown})'

    # Exactly one type present: a lone variable is described without a count.
    count, text = groups[0]
    if count == 1:
        return text
    return f'{count} {text}'
46+
47+
48+
def format_summary_details(data):
    """
    Form the entire descriptive part of the input/output summary.

    The summary has four lines: the instance and column counts, followed by
    the ``Features``, ``Target`` and ``Metas`` descriptions produced by
    ``format_variables_string``. The column count on the first line covers
    ``domain.variables`` and ``domain.metas`` together.

    :param data: A dataset, or None when there is no data
    :type data: Orange.data.Table
    :return: A formatted string; the empty string when `data` is None
    """
    # Test for None explicitly: a table without rows is falsy but still has a
    # domain worth describing, so it must not be treated as missing input.
    if data is None:
        return ''

    def _counted(number, noun):
        # Append the plural 's' for every count except exactly one.
        return f'{number} {noun}' + ('' if number == 1 else 's')

    domain = data.domain
    n_features = len(domain.variables) + len(domain.metas)
    return '\n'.join((
        f"{_counted(len(data), 'instance')}, "
        f"{_counted(n_features, 'feature')}",
        'Features: ' + format_variables_string(domain.attributes),
        'Target: ' + format_variables_string(domain.class_vars),
        'Metas: ' + format_variables_string(domain.metas),
    ))
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
import unittest
2+
import datetime
3+
from collections import namedtuple
4+
5+
import numpy as np
6+
7+
from Orange.widgets.utils.state_summary import format_summary_details
8+
from Orange.data import Table, Domain, StringVariable, ContinuousVariable, \
9+
DiscreteVariable, TimeVariable
10+
11+
# Pairs a variable descriptor with the column of values that belongs to it.
VarDataPair = namedtuple('VarDataPair', 'variable data')

# Numeric columns: one complete, one containing a missing value (NaN).
continuous_full = VarDataPair(
    variable=ContinuousVariable('continuous_full'),
    data=np.array([0, 1, 2, 3, 4], dtype=float),
)
continuous_missing = VarDataPair(
    variable=ContinuousVariable('continuous_missing'),
    data=np.array([0, 1, 2, np.nan, 4], dtype=float),
)

# Unordered categorical columns; the data holds indices into `values`.
rgb_full = VarDataPair(
    variable=DiscreteVariable('rgb_full', values=['r', 'g', 'b']),
    data=np.array([0, 1, 1, 1, 2], dtype=float),
)
rgb_missing = VarDataPair(
    variable=DiscreteVariable('rgb_missing', values=['r', 'g', 'b']),
    data=np.array([0, 1, 1, np.nan, 2], dtype=float),
)

# Ordered categorical columns.
ints_full = VarDataPair(
    variable=DiscreteVariable(
        'ints_full', values=['2', '3', '4'], ordered=True),
    data=np.array([0, 1, 1, 1, 2], dtype=float),
)
ints_missing = VarDataPair(
    variable=DiscreteVariable(
        'ints_missing', values=['2', '3', '4'], ordered=True),
    data=np.array([0, 1, 1, np.nan, 2], dtype=float),
)
42+
43+
def _to_timestamps(years):
    """Convert years to timestamps of 1 January of each year.

    NaN entries are passed through as NaN so that missing values survive the
    conversion.
    """
    stamps = []
    for year in years:
        if np.isnan(year):
            stamps.append(np.nan)
        else:
            stamps.append(datetime.datetime(year, 1, 1).timestamp())
    return stamps
46+
47+
# Time columns: one complete, one containing a missing value (NaN).
time_full = VarDataPair(
    variable=TimeVariable('time_full'),
    data=np.array(
        _to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float),
)
time_missing = VarDataPair(
    variable=TimeVariable('time_missing'),
    data=np.array(
        _to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float),
)

# String columns: one complete, one containing the unknown-value marker.
string_full = VarDataPair(
    variable=StringVariable('string_full'),
    data=np.array(['a', 'b', 'c', 'd', 'e'], dtype=object),
)
string_missing = VarDataPair(
    variable=StringVariable('string_missing'),
    data=np.array(
        ['a', 'b', 'c', StringVariable.Unknown, 'e'], dtype=object),
)
65+
66+
67+
def make_table(attributes, target=None, metas=None):
    """Build an instance of a table from (variable, column) pairs.

    Parameters
    ----------
    attributes : Iterable[Tuple[Variable, np.array]]
        Pairs used for the feature part of the table.
    target : Optional[Iterable[Tuple[Variable, np.array]]]
        Pairs used for the class variables; omitted when None.
    metas : Optional[Iterable[Tuple[Variable, np.array]]]
        Pairs used for the meta attributes; omitted when None.

    Returns
    -------
    Table

    """
    def _unzip(pairs):
        # Separate the variables from their columns, then transpose so that
        # each input column becomes a column of the resulting 2-D array.
        variables, columns = tuple(zip(*pairs))
        return variables, np.array(columns).T

    attribute_vars, attribute_vals = _unzip(attributes)
    target_vars, target_vals = \
        (None, None) if target is None else _unzip(target)
    meta_vars, meta_vals = \
        (None, None) if metas is None else _unzip(metas)

    domain = Domain(attribute_vars, class_vars=target_vars, metas=meta_vars)
    return Table.from_numpy(
        domain, X=attribute_vals, Y=target_vals, metas=meta_vals)
98+
99+
100+
class TestUtils(unittest.TestCase):
    """Tests for the summary helpers in ``state_summary``."""

    @staticmethod
    def _column_count(data):
        # Number reported as "features" on the first summary line:
        # attributes and class variables plus metas.
        return len(data.domain.variables) + len(data.domain.metas)

    def test_details(self):
        """Check if details part of the summary is formatted correctly"""
        # Single categorical target, one string meta.
        data = Table('zoo')
        n_features = self._column_count(data)
        expected = (
            f'{len(data)} instances, '
            f'{n_features} features\n'
            f'Features: {len(data.domain.attributes)} categorical\n'
            'Target: categorical\n'
            'Metas: string (not shown)'
        )
        self.assertEqual(expected, format_summary_details(data))

        # Numeric features and target, no metas.
        data = Table('housing')
        n_features = self._column_count(data)
        expected = (
            f'{len(data)} instances, '
            f'{n_features} features\n'
            f'Features: {len(data.domain.attributes)} numeric\n'
            'Target: numeric\n'
            'Metas: —'
        )
        self.assertEqual(expected, format_summary_details(data))

        # Features of mixed types are broken down in parentheses.
        data = Table('heart_disease')
        n_features = self._column_count(data)
        expected = (
            f'{len(data)} instances, '
            f'{n_features} features\n'
            f'Features: {len(data.domain.attributes)} '
            '(7 categorical, 6 numeric)\n'
            'Target: categorical\n'
            'Metas: —'
        )
        self.assertEqual(expected, format_summary_details(data))

        # Several variables of one type in every part of the domain.
        data = make_table(
            [continuous_full, continuous_missing],
            target=[rgb_full, rgb_missing],
            metas=[ints_full, ints_missing],
        )
        n_features = self._column_count(data)
        expected = (
            f'{len(data)} instances, '
            f'{n_features} features\n'
            f'Features: {len(data.domain.attributes)} numeric\n'
            f'Target: {len(data.domain.class_vars)} categorical\n'
            f'Metas: {len(data.domain.metas)} categorical'
        )
        self.assertEqual(expected, format_summary_details(data))

        # Mixed features and targets, string metas.
        data = make_table(
            [continuous_full, time_full, ints_full, rgb_missing],
            target=[rgb_full, continuous_missing],
            metas=[string_full, string_missing],
        )
        n_features = self._column_count(data)
        expected = (
            f'{len(data)} instances, '
            f'{n_features} features\n'
            f'Features: {len(data.domain.attributes)} '
            '(2 categorical, 1 numeric, 1 time)\n'
            f'Target: {len(data.domain.class_vars)} '
            '(1 categorical, 1 numeric)\n'
            f'Metas: {len(data.domain.metas)} string (not shown)'
        )
        self.assertEqual(expected, format_summary_details(data))

        # Time variables are reported as 'time', not 'numeric'.
        data = make_table(
            [time_full, time_missing], target=[ints_missing], metas=None)
        expected = (
            f'{len(data)} instances, '
            f'{len(data.domain.variables)} features\n'
            f'Features: {len(data.domain.attributes)} time\n'
            'Target: categorical\n'
            'Metas: —'
        )
        self.assertEqual(expected, format_summary_details(data))

        # No target and no metas.
        data = make_table([rgb_full, ints_full], target=None, metas=None)
        expected = (
            f'{len(data)} instances, '
            f'{len(data.domain.variables)} features\n'
            f'Features: {len(data.domain.variables)} categorical\n'
            'Target: —\n'
            'Metas: —'
        )
        self.assertEqual(expected, format_summary_details(data))

        # A single column: singular 'feature' and no count in the description.
        data = make_table([rgb_full], target=None, metas=None)
        expected = (
            f'{len(data)} instances, '
            f'{len(data.domain.variables)} feature\n'
            'Features: categorical\n'
            'Target: —\n'
            'Metas: —'
        )
        self.assertEqual(expected, format_summary_details(data))

        # Missing input yields an empty summary.
        data = None
        self.assertEqual('', format_summary_details(data))

0 commit comments

Comments
 (0)