Skip to content

Commit 89f31fc

Browse files
authored
Merge pull request #4231 from PrimozGodec/fix-box-plot
[FIX] Various fixes of box plot
2 parents c58eaa4 + 7f79121 commit 89f31fc

File tree

8 files changed

+547
-505
lines changed

8 files changed

+547
-505
lines changed

Orange/data/_contingency.c

Lines changed: 448 additions & 490 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Orange/data/_contingency.pyx

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,16 @@ def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray
4343
i = ranks[i]
4444
v = col_data[i]
4545
tc = classes[i]
46+
if v != last and not npy_isnan(v):
47+
j += 1
48+
V[j] = v
49+
last = v
4650
if npy_isnan(v) and npy_isnan(tc):
4751
unknowns += W[i] if weights else 1.
4852
elif npy_isnan(tc):
4953
row_unknowns[j] += W[i] if weights else 1.
5054
elif npy_isnan(v):
5155
col_unknowns[int(tc)] += W[i] if weights else 1.
52-
elif v != last:
53-
j += 1
54-
V[j] = v
55-
last = v
56-
C[int(tc),j] += W[i] if weights else 1.
5756
else:
5857
C[int(tc),j] += W[i] if weights else 1.
5958

Orange/data/table.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import bottleneck as bn
1212
import numpy as np
1313
from scipy import sparse as sp
14+
from scipy.sparse import issparse
1415

1516
import Orange.data # import for io.py
1617
from Orange.data import (
@@ -1437,6 +1438,11 @@ def _compute_contingency(self, col_vars=None, row_var=None):
14371438
raise ValueError("contingency can be computed only for discrete "
14381439
"and continuous values")
14391440

1441+
# when we select a column in a sparse matrix it is still two-dimensional
1442+
# and sparse - since it is just a column we can afford to transform
1443+
# it to dense and make it 1D
1444+
if issparse(row_data):
1445+
row_data = row_data.toarray().ravel()
14401446
if row_data.dtype.kind != "f": #meta attributes can be stored as type object
14411447
row_data = row_data.astype(float)
14421448

Orange/statistics/distribution.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,19 @@ def from_data(cls, data, variable):
130130
self.variable = variable
131131
return self
132132

133+
@property
134+
def array_with_unknowns(self):
135+
"""
136+
This property returns a distribution array with unknowns added
137+
at the end
138+
139+
Returns
140+
-------
141+
np.array
142+
Array with appended unknowns at the end of the row.
143+
"""
144+
return np.append(np.array(self), self.unknowns)
145+
133146
def __getitem__(self, index):
134147
if isinstance(index, str):
135148
index = self.variable.to_val(index)

Orange/tests/test_contingency.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import numpy as np
88
import scipy.sparse as sp
9+
from scipy.sparse import csr_matrix, csc_matrix
910

1011
from Orange.data import DiscreteVariable, Table, Domain
1112
from Orange.statistics import contingency
@@ -25,6 +26,7 @@ class TestDiscrete(unittest.TestCase):
2526
@classmethod
2627
def setUpClass(cls):
2728
cls.zoo = data.Table("zoo")
29+
cls.test9 = data.Table(test_filename("datasets/test9.tab"))
2830

2931
def test_discrete(self):
3032
cont = contingency.Discrete(self.zoo, 0)
@@ -153,6 +155,16 @@ def test_continuous_missing(self):
153155
0., 0., 0., 0., 0., 0., 0.])
154156
self.assertEqual(cont.unknowns, 1)
155157

158+
# this one was failing before because of the issue in _contingency.pyx
159+
d.Y[:50] = np.zeros(50) * float("nan")
160+
cont = contingency.Continuous(d, "sepal width")
161+
np.testing.assert_almost_equal(cont.col_unknowns, [0, 0, 0])
162+
np.testing.assert_almost_equal(
163+
cont.row_unknowns,
164+
[0., 0., 1., 0., 0., 0., 0., 0., 1., 5., 5., 5., 2., 9., 6., 2.,
165+
3., 4., 2., 1., 1., 1., 1.])
166+
self.assertEqual(cont.unknowns, 1)
167+
156168
def test_mixedtype_metas(self):
157169
import Orange
158170
zoo = Orange.data.Table("zoo")
@@ -286,12 +298,35 @@ def test_get_contingencies(self):
286298
assert_dist_equal(cont[2], [1, 0, 0])
287299

288300
def test_compute_contingency_metas(self):
289-
d = data.Table(test_filename("datasets/test9.tab"))
290-
var1, var2 = d.domain[-2], d.domain[-4]
291-
cont = d._compute_contingency([var1], var2)[0][0]
301+
var1, var2 = self.test9.domain[-2], self.test9.domain[-4]
302+
cont = contingency.Discrete(self.test9, var1, var2)
292303
assert_dist_equal(cont, [[3, 0, 0], [0, 2, 0],
293304
[0, 0, 2], [0, 1, 0]])
294305

306+
def test_compute_contingency_row_attribute_sparse(self):
307+
"""
308+
Testing with sparse row variable since currently we do not test the
309+
situation when a row variable is sparse.
310+
"""
311+
d = self.test9
312+
# make X sparse
313+
d.X = csr_matrix(d.X)
314+
var1, var2 = d.domain[0], d.domain[1]
315+
cont = contingency.Discrete(d, var1, var2)
316+
assert_dist_equal(cont, [[1, 0], [1, 0], [1, 0], [1, 0],
317+
[0, 1], [0, 1], [0, 1], [0, 1]])
318+
cont = contingency.Discrete(d, var2, var1)
319+
assert_dist_equal(cont, [[1, 1, 1, 1, 0, 0, 0, 0],
320+
[0, 0, 0, 0, 1, 1, 1, 1]])
321+
322+
d.X = csc_matrix(d.X)
323+
cont = contingency.Discrete(d, var1, var2)
324+
assert_dist_equal(cont, [[1, 0], [1, 0], [1, 0], [1, 0],
325+
[0, 1], [0, 1], [0, 1], [0, 1]])
326+
cont = contingency.Discrete(d, var2, var1)
327+
assert_dist_equal(cont, [[1, 1, 1, 1, 0, 0, 0, 0],
328+
[0, 0, 0, 0, 1, 1, 1, 1]])
329+
295330
def test_compute_contingency_invalid(self):
296331
rstate = np.random.RandomState(0xFFFF)
297332
X = data.ContinuousVariable("X")

Orange/tests/test_distribution.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,17 @@ def test_min_max(self):
204204
self.assertEqual(self.num.min(), '1')
205205
self.assertEqual(self.num.max(), '3')
206206

207+
def test_array_with_unknowns(self):
208+
d = data.Table("zoo")
209+
d.Y[0] = np.nan
210+
disc = distribution.Discrete(d, "type")
211+
self.assertIsInstance(disc, np.ndarray)
212+
self.assertEqual(disc.unknowns, 1)
213+
true_freq = [4., 20., 13., 8., 10., 40., 5.]
214+
assert_dist_equal(disc, true_freq)
215+
np.testing.assert_array_equal(disc.array_with_unknowns,
216+
np.append(true_freq, 1))
217+
207218

208219
class TestContinuousDistribution(unittest.TestCase):
209220
@classmethod

Orange/widgets/visualize/owboxplot.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import Orange.data
1717
from Orange.data.filter import FilterDiscrete, FilterContinuous, Values
1818
from Orange.statistics import contingency, distribution
19-
from Orange.statistics.contingency import Discrete
2019

2120
from Orange.widgets import widget, gui
2221
from Orange.widgets.settings import (Setting, DomainContextHandler,
@@ -647,15 +646,14 @@ def _display_changed_disc(self):
647646
[self.strudel(cont, i)
648647
for i, cont in enumerate(self.conts.array_with_unknowns)
649648
if np.sum(cont) > 0]
650-
self.conts = self.conts[np.sum(np.array(self.conts), axis=1) > 0]
651649

652650
if self.sort_freqs:
653651
# pylint: disable=invalid-unary-operand-type
654652
self.order = sorted(
655653
self.order, key=(-np.sum(
656654
self.conts.array_with_unknowns, axis=1)).__getitem__)
657655
else:
658-
self.boxes = [self.strudel(self.dist, self.dist.unknowns)]
656+
self.boxes = [self.strudel(self.dist.array_with_unknowns)]
659657

660658
for row, box_index in enumerate(self.order):
661659
y = (-len(self.boxes) + row) * 40 + 10
@@ -921,9 +919,10 @@ def draw_axis_disc(self):
921919
step = steps = 10
922920
else:
923921
if self.group_var:
924-
max_box = max(float(np.sum(dist)) for dist in self.conts)
922+
max_box = max(float(np.sum(dist))
923+
for dist in self.conts.array_with_unknowns)
925924
else:
926-
max_box = float(np.sum(self.dist))
925+
max_box = float(np.sum(self.dist.array_with_unknowns))
927926
if max_box == 0:
928927
self.scale_x = 1
929928
return
@@ -944,7 +943,7 @@ def draw_axis_disc(self):
944943
right_offset = 0 # offset for the right label
945944
if not self.show_stretched and self.labels:
946945
if self.group_var:
947-
rows = list(zip(self.conts, self.labels))
946+
rows = list(zip(self.conts.array_with_unknowns, self.labels))
948947
else:
949948
rows = [(self.dist, self.labels[0])]
950949
# available space left of the 'group labels'

Orange/widgets/visualize/tests/test_owboxplot.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import numpy as np
77
from AnyQt.QtCore import QItemSelectionModel
88

9-
from Orange.data import Table, ContinuousVariable, StringVariable, Domain
9+
from Orange.data import Table, ContinuousVariable, StringVariable, Domain, \
10+
DiscreteVariable
1011
from Orange.widgets.visualize.owboxplot import (
1112
OWBoxPlot, FilterGraphicsRectItem, _quantiles
1213
)
@@ -299,6 +300,26 @@ def test_stretching(self):
299300
self.__select_group("chest pain")
300301
self.assertTrue(enabled())
301302

303+
def test_value_all_missing_for_group(self):
304+
"""
305+
This is one of the extreme cases when we have a subgroup value
306+
where all values in the selected variable are missing. Box plot should
307+
handle this.
308+
"""
309+
data = Table(Domain([DiscreteVariable("a", values=["v1", "v2", "v3"]),
310+
DiscreteVariable("b", values=["v3", "v4"])]),
311+
[[0., 0.],
312+
[0., 1.],
313+
[1., np.nan],
314+
[1., np.nan],
315+
[2., 0.],
316+
[2., 0.]])
317+
self.send_signal(self.widget.Inputs.data, data)
318+
319+
self.__select_variable("b")
320+
self.__select_group("a")
321+
self.assertTupleEqual(self.widget.conts.shape, (3, 2))
322+
302323

303324
class TestUtils(unittest.TestCase):
304325
def test(self):

0 commit comments

Comments
 (0)