Skip to content

Commit 38f42d8

Browse files
mstrazar, kernc
authored and committed
Venn diagram on sparse data (#2334)
[ENH] Venn diagram on sparse data
1 parent 6758462 commit 38f42d8

File tree

2 files changed

+66
-36
lines changed

2 files changed

+66
-36
lines changed

Orange/widgets/visualize/owvenndiagram.py

Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from AnyQt.QtCore import pyqtSignal as Signal
2525

2626
import Orange.data
27+
import Orange.statistics.util as util
2728
from Orange.widgets import widget, gui, settings
2829
from Orange.widgets.utils import itemmodels, colorpalette
2930
from Orange.widgets.utils.annotated_data import (create_annotated_table,
@@ -179,7 +180,7 @@ def handleNewSignals(self):
179180
has_identifiers = all(source_attributes(input.table.domain)
180181
for input in self.data.values())
181182
has_any_identifiers = any(source_attributes(input.table.domain)
182-
for input in self.data.values())
183+
for input in self.data.values())
183184
self.useequalityButton.setEnabled(samedomain)
184185
self.useidentifiersButton.setEnabled(
185186
has_any_identifiers or len(self.data) == 0)
@@ -499,7 +500,7 @@ def _on_inputAttrActivated(self, attr_index):
499500
index = i
500501
break
501502

502-
assert (index is not None)
503+
assert index is not None
503504

504505
key, _ = inputs[index]
505506

@@ -718,13 +719,9 @@ def table_concat(tables):
718719
variables_seen.update(table.domain.metas)
719720

720721
domain = Orange.data.Domain(attributes, class_vars, metas)
721-
new_table = Orange.data.Table(domain)
722-
723-
for table in tables:
724-
new_table.extend(Orange.data.Table.from_table(domain, table))
725-
new_table.attributes.update(table.attributes)
726722

727-
return new_table
723+
tables = [tab.transform(domain) for tab in tables]
724+
return tables[0].concatenate(tables, axis=0)
728725

729726

730727
def copy_descriptor(descriptor, newname=None):
@@ -793,10 +790,12 @@ def inst_key(inst, vars):
793790
# each instance in this list belongs to one group (but not all
794791
# groups need to be present).
795792
inst_by_id = defaultdict(list)
793+
id_by_inst = defaultdict(list) # inverse mapping
796794

797795
for i in range(len(table)):
798796
inst_id = instance_ids[i]
799797
inst_by_id[inst_id].append(i)
798+
id_by_inst[i] = inst_id
800799

801800
newfeatures = []
802801
newclass_vars = []
@@ -834,21 +833,27 @@ def expanded(feat):
834833

835834
domain = Orange.data.Domain(newfeatures, newclass_vars, newmetas)
836835
prototype_indices = [inst_by_id[inst_id][0] for inst_id in ids]
837-
newtable = Orange.data.Table.from_table(domain, table)[prototype_indices]
836+
newtable = table[prototype_indices].transform(domain)
838837
in_expanded = set(f for efd in expanded_features.values() for f in efd.values())
839838

839+
# Fill-in nan values
840+
for var in domain.variables + domain.metas:
841+
if var in idvarlist or var in in_expanded:
842+
continue
843+
col, _ = newtable.get_column_view(var)
844+
nan_indices = (i for i in col.nonzero()[0]
845+
if isinstance(col[i], str) or numpy.isnan(col[i]))
846+
for i in nan_indices:
847+
for ind in inst_by_id[ids[i]]:
848+
if not numpy.isnan(table[ind, var]):
849+
newtable[i, var] = table[ind, var]
850+
break
851+
852+
# Fill-in expanded features if any
840853
for i, inst_id in enumerate(ids):
841854
indices = inst_by_id[inst_id]
842855
instance = newtable[i]
843856

844-
for var in domain.variables + domain.metas:
845-
if var in idvarlist or var in in_expanded:
846-
continue
847-
if numpy.isnan(instance[var]):
848-
for ind in indices:
849-
if not numpy.isnan(table[ind, var]):
850-
newtable[i, var] = table[ind, var]
851-
852857
for index in indices:
853858
source_inst = table[index]
854859
group = instance_groups[index]
@@ -896,14 +901,13 @@ def varying_between(table, idvar):
896901
for indices in idmap.values():
897902
subset = table[indices]
898903
for var in list(candidate_set):
899-
values, _ = subset.get_column_view(var)
904+
column, _ = subset.get_column_view(var)
905+
values = util.unique(column)
900906

901-
if var.is_string:
902-
uniq = set(values)
903-
else:
904-
uniq = unique_non_nan(values)
907+
if not var.is_string:
908+
values = unique_non_nan(values)
905909

906-
if len(uniq) > 1:
910+
if len(values) > 1:
907911
varying.add(var)
908912
candidate_set.remove(var)
909913

@@ -937,16 +941,14 @@ def string_attributes(domain):
937941
"""
938942
Return all string attributes from the domain.
939943
"""
940-
return [attr for attr in domain.variables + domain.metas
941-
if attr.is_string]
944+
return [attr for attr in domain.variables + domain.metas if attr.is_string]
942945

943946

944947
def discrete_attributes(domain):
945948
"""
946949
Return all discrete attributes from the domain.
947950
"""
948-
return [attr for attr in domain.variables + domain.metas
949-
if attr.is_discrete]
951+
return [attr for attr in domain.variables + domain.metas if attr.is_discrete]
950952

951953

952954
def source_attributes(domain):
@@ -1437,8 +1439,7 @@ def subset_anchors(shapes):
14371439
unit_point(270, r=0.35), # C
14381440
unit_point(210, r=0.27), # AC
14391441
unit_point(330, r=0.27), # BC
1440-
unit_point(0, r=0), # ABC
1441-
]
1442+
unit_point(0, r=0),] # ABC
14421443
elif n == 4:
14431444
anchors = [
14441445
(0.400, 0.110), # A
@@ -1621,7 +1622,6 @@ def append_column(data, where, variable, column):
16211622
attr = domain.attributes
16221623
class_vars = domain.class_vars
16231624
metas = domain.metas
1624-
16251625
if where == "X":
16261626
attr = attr + (variable,)
16271627
X = numpy.hstack((X, column))
@@ -1634,10 +1634,9 @@ def append_column(data, where, variable, column):
16341634
else:
16351635
raise ValueError
16361636
domain = Orange.data.Domain(attr, class_vars, metas)
1637-
table = Orange.data.Table.from_numpy(domain, X, Y, M, W if W.size else None)
1638-
table.ids = data.ids
1639-
table.attributes = data.attributes
1640-
return table
1637+
new_data = data.transform(domain)
1638+
new_data[:, variable] = column
1639+
return new_data
16411640

16421641

16431642
def drop_columns(data, columns):

Orange/widgets/visualize/tests/test_owvenndiagram.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import unittest
55
import numpy as np
6+
import scipy.sparse as sp
67
from collections import defaultdict
78

89
from Orange.data import (Table, Domain, StringVariable,
@@ -74,8 +75,8 @@ def test_venn_diagram(self):
7475
source_var, item_id_var)
7576
temp_m = np.array([[cv[0, i], sources[i], table.metas[0 + i, 0]],
7677
[cv[1, i], sources[i], table.metas[1 + i, 0]],
77-
[cv[2, i], sources[i], table.metas[2 + i, 0]]
78-
], dtype=object)
78+
[cv[2, i], sources[i], table.metas[2 + i, 0]]],
79+
dtype=object)
7980
temp_table = self.add_metas(temp_table, temp_d, temp_m)
8081
tables.append(temp_table)
8182

@@ -178,6 +179,36 @@ def test_no_data(self):
178179

179180

180181
class GroupTableIndicesTest(unittest.TestCase):
182+
183+
def test_varying_between_combined(self):
184+
X = np.array([[0, 0, 0, 0, 0, 1,],
185+
[0, 0, 1, 1, 0, 1,],
186+
[0, 0, 0, 2, np.nan, np.nan,],
187+
[0, 1, 0, 0, 0, 0,],
188+
[0, 1, 0, 2, 0, 0,],
189+
[0, 1, 0, 0, np.nan, 0,]])
190+
191+
M = np.array([["A", 0, 0, 0, 0, 0, 1,],
192+
["A", 0, 0, 1, 1, 0, 1,],
193+
["A", 0, 0, 0, 2, np.nan, np.nan,],
194+
["B", 0, 1, 0, 0, 0, 0,],
195+
["B", 0, 1, 0, 2, 0, 0,],
196+
["B", 0, 1, 0, 0, np.nan, 0,]], dtype=str)
197+
198+
variables = [ContinuousVariable(name="F%d" % j) for j in range(X.shape[1])]
199+
metas = [StringVariable(name="M%d" % j) for j in range(M.shape[1])]
200+
domain = Domain(attributes=variables, metas=metas)
201+
202+
data = Table.from_numpy(X=X, domain=domain, metas=M)
203+
204+
self.assertEqual(varying_between(data, idvar=data.domain.metas[0]),
205+
[variables[2], variables[3], metas[3], metas[4], metas[5], metas[6]])
206+
207+
data = Table.from_numpy(X=sp.csr_matrix(X), domain=domain, metas=M)
208+
self.assertEqual(varying_between(data, idvar=data.domain.metas[0]),
209+
[variables[2], variables[3], metas[3], metas[4], metas[5], metas[6]])
210+
211+
181212
def test_group_table_indices(self):
182213
table = Table(test_filename("test9.tab"))
183214
dd = defaultdict(list)

0 commit comments

Comments
 (0)