From 933102b3a603c9b620b7338cb6ba8642b5d59d32 Mon Sep 17 00:00:00 2001 From: mstrazar Date: Mon, 22 May 2017 17:00:55 +0200 Subject: [PATCH 1/5] Rewrite varying_between and reshape_wide to work on sparse data efficiently. Add a TODO. --- Orange/widgets/visualize/owvenndiagram.py | 54 +++++++++++++---------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/Orange/widgets/visualize/owvenndiagram.py b/Orange/widgets/visualize/owvenndiagram.py index 8108d20db2f..80309ba7409 100644 --- a/Orange/widgets/visualize/owvenndiagram.py +++ b/Orange/widgets/visualize/owvenndiagram.py @@ -24,6 +24,7 @@ from AnyQt.QtCore import pyqtSignal as Signal import Orange.data +import Orange.statistics.util as util from Orange.widgets import widget, gui, settings from Orange.widgets.utils import itemmodels, colorpalette from Orange.widgets.utils.annotated_data import (create_annotated_table, @@ -713,12 +714,13 @@ def table_concat(tables): variables_seen.update(table.domain.metas) domain = Orange.data.Domain(attributes, class_vars, metas) - new_table = Orange.data.Table(domain) - for table in tables: - new_table.extend(Orange.data.Table.from_table(domain, table)) - new_table.attributes.update(table.attributes) + tables = list(map(lambda tab: tab.transform(domain), tables)) + # TODO: remove castint to Table when Corpus.concatenate becomes available + new_table = Orange.data.Table.from_table(domain, + Orange.data.Table.concatenate(tables, axis=0) + ) return new_table @@ -788,10 +790,12 @@ def inst_key(inst, vars): # each instance in this list belongs to one group (but not all # groups need to be present). 
inst_by_id = defaultdict(list) + id_by_inst = defaultdict(list) # inverse mapping for i in range(len(table)): inst_id = instance_ids[i] inst_by_id[inst_id].append(i) + id_by_inst[i] = inst_id newfeatures = [] newclass_vars = [] @@ -829,21 +833,26 @@ def expanded(feat): domain = Orange.data.Domain(newfeatures, newclass_vars, newmetas) prototype_indices = [inst_by_id[inst_id][0] for inst_id in ids] - newtable = Orange.data.Table.from_table(domain, table)[prototype_indices] + newtable = table[prototype_indices].transform(domain) in_expanded = set(f for efd in expanded_features.values() for f in efd.values()) + # Fill-in nan values + for var in domain.variables + domain.metas: + if var in idvarlist or var in in_expanded: + continue + col, _ = newtable.get_column_view(var) + nan_indices = filter(lambda j: isinstance(col[j], str) or numpy.isnan(col[j]), col.nonzero()[0]) + for i in nan_indices: + for ind in inst_by_id[ids[i]]: + if not numpy.isnan(table[ind, var]): + newtable[i, var] = table[ind, var] + break + + # Fill-in expanded features if any for i, inst_id in enumerate(ids): indices = inst_by_id[inst_id] instance = newtable[i] - for var in domain.variables + domain.metas: - if var in idvarlist or var in in_expanded: - continue - if numpy.isnan(instance[var]): - for ind in indices: - if not numpy.isnan(table[ind, var]): - newtable[i, var] = table[ind, var] - for index in indices: source_inst = table[index] group = instance_groups[index] @@ -891,14 +900,13 @@ def varying_between(table, idvar): for indices in idmap.values(): subset = table[indices] for var in list(candidate_set): - values, _ = subset.get_column_view(var) + column, _ = subset.get_column_view(var) + values = util.unique(column) - if var.is_string: - uniq = set(values) - else: - uniq = unique_non_nan(values) + if not var.is_string: + values = unique_non_nan(values) - if len(uniq) > 1: + if len(values) > 1: varying.add(var) candidate_set.remove(var) @@ -1616,7 +1624,6 @@ def append_column(data, where, 
variable, column): attr = domain.attributes class_vars = domain.class_vars metas = domain.metas - if where == "X": attr = attr + (variable,) X = numpy.hstack((X, column)) @@ -1629,10 +1636,9 @@ def append_column(data, where, variable, column): else: raise ValueError domain = Orange.data.Domain(attr, class_vars, metas) - table = Orange.data.Table.from_numpy(domain, X, Y, M, W if W.size else None) - table.ids = data.ids - table.attributes = data.attributes - return table + new_data = data.transform(domain) + new_data[:,variable] = column + return new_data def drop_columns(data, columns): From 533e8139473536d149e000d9973fc58fc75f39af Mon Sep 17 00:00:00 2001 From: mstrazar Date: Fri, 26 May 2017 13:11:24 +0200 Subject: [PATCH 2/5] Added a test for varying between. --- .../visualize/tests/test_owvenndiagram.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Orange/widgets/visualize/tests/test_owvenndiagram.py b/Orange/widgets/visualize/tests/test_owvenndiagram.py index 54632012b4d..3d539714b2a 100644 --- a/Orange/widgets/visualize/tests/test_owvenndiagram.py +++ b/Orange/widgets/visualize/tests/test_owvenndiagram.py @@ -3,6 +3,7 @@ import unittest import numpy as np +import scipy.sparse as sp from collections import defaultdict from Orange.data import (Table, Domain, StringVariable, @@ -179,6 +180,36 @@ def test_no_data(self): class GroupTableIndicesTest(unittest.TestCase): + + def test_varying_between_combined(self): + X = np.array([[0, 0, 0, 0, 0, 1, ], + [0, 0, 1, 1, 0, 1, ], + [0, 0, 0, 2, np.nan, np.nan, ], + [0, 1, 0, 0, 0, 0, ], + [0, 1, 0, 2, 0, 0, ], + [0, 1, 0, 0, np.nan, 0, ]]) + + M = np.array([["A", 0, 0, 0, 0, 0, 1, ], + ["A", 0, 0, 1, 1, 0, 1, ], + ["A", 0, 0, 0, 2, np.nan, np.nan, ], + ["B", 0, 1, 0, 0, 0, 0, ], + ["B", 0, 1, 0, 2, 0, 0, ], + ["B", 0, 1, 0, 0, np.nan, 0, ]], dtype=str) + + variables = [ContinuousVariable(name="F%d" % j) for j in range(X.shape[1])] + metas = [StringVariable(name="M%d" % j) for j in 
range(M.shape[1])] + domain = Domain(attributes=variables, metas=metas) + + data = Table.from_numpy(X=X, domain=domain, metas=M) + + self.assertEqual(varying_between(data, idvar=data.domain.metas[0]), + [variables[2], variables[3], metas[3], metas[4], metas[5], metas[6]]) + + data = Table.from_numpy(X=sp.csr_matrix(X), domain=domain, metas=M) + self.assertEqual(varying_between(data, idvar=data.domain.metas[0]), + [variables[2], variables[3], metas[3], metas[4], metas[5], metas[6]]) + + def test_group_table_indices(self): table = Table(test_filename("test9.tab")) dd = defaultdict(list) From d9760e17cee4366b30e20bcabd48a63d67cbb71e Mon Sep 17 00:00:00 2001 From: mstrazar Date: Fri, 26 May 2017 14:36:19 +0200 Subject: [PATCH 3/5] Formatting issues. --- Orange/widgets/visualize/owvenndiagram.py | 22 +++++++------- .../visualize/tests/test_owvenndiagram.py | 29 +++++++++---------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/Orange/widgets/visualize/owvenndiagram.py b/Orange/widgets/visualize/owvenndiagram.py index 80309ba7409..a8240dcfcdb 100644 --- a/Orange/widgets/visualize/owvenndiagram.py +++ b/Orange/widgets/visualize/owvenndiagram.py @@ -173,9 +173,9 @@ def handleNewSignals(self): self.samedomain = samedomain has_identifiers = all(source_attributes(input.table.domain) - for input in self.data.values()) + for input in self.data.values()) has_any_identifiers = any(source_attributes(input.table.domain) - for input in self.data.values()) + for input in self.data.values()) self.useequalityButton.setEnabled(samedomain) self.useidentifiersButton.setEnabled( has_any_identifiers or len(self.data) == 0) @@ -495,7 +495,7 @@ def _on_inputAttrActivated(self, attr_index): index = i break - assert (index is not None) + assert index is not None key, _ = inputs[index] @@ -717,10 +717,9 @@ def table_concat(tables): tables = list(map(lambda tab: tab.transform(domain), tables)) - # TODO: remove castint to Table when Corpus.concatenate becomes available + # TODO: 
remove casting to Table when Corpus.concatenate becomes available new_table = Orange.data.Table.from_table(domain, - Orange.data.Table.concatenate(tables, axis=0) - ) + Orange.data.Table.concatenate(tables, axis=0)) return new_table @@ -841,7 +840,8 @@ def expanded(feat): if var in idvarlist or var in in_expanded: continue col, _ = newtable.get_column_view(var) - nan_indices = filter(lambda j: isinstance(col[j], str) or numpy.isnan(col[j]), col.nonzero()[0]) + nan_indices = filter(lambda j: isinstance(col[j], str) or numpy.isnan(col[j]), + col.nonzero()[0]) for i in nan_indices: for ind in inst_by_id[ids[i]]: if not numpy.isnan(table[ind, var]): @@ -948,8 +948,7 @@ def discrete_attributes(domain): """ Return all discrete attributes from the domain. """ - return [attr for attr in domain.variables + domain.metas - if attr.is_discrete] + return [attr for attr in domain.variables + domain.metas if attr.is_discrete] def source_attributes(domain): @@ -1440,8 +1439,7 @@ def subset_anchors(shapes): unit_point(270, r=0.35), # C unit_point(210, r=0.27), # AC unit_point(330, r=0.27), # BC - unit_point(0, r=0), # ABC - ] + unit_point(0, r=0),] # ABC elif n == 4: anchors = [ (0.400, 0.110), # A @@ -1637,7 +1635,7 @@ def append_column(data, where, variable, column): raise ValueError domain = Orange.data.Domain(attr, class_vars, metas) new_data = data.transform(domain) - new_data[:,variable] = column + new_data[:, variable] = column return new_data diff --git a/Orange/widgets/visualize/tests/test_owvenndiagram.py b/Orange/widgets/visualize/tests/test_owvenndiagram.py index 3d539714b2a..8e1355c246c 100644 --- a/Orange/widgets/visualize/tests/test_owvenndiagram.py +++ b/Orange/widgets/visualize/tests/test_owvenndiagram.py @@ -76,8 +76,7 @@ def test_venn_diagram(self): source_var, item_id_var) temp_m = np.array([[cv[0, i], sources[i], table.metas[0 + i, 0]], [cv[1, i], sources[i], table.metas[1 + i, 0]], - [cv[2, i], sources[i], table.metas[2 + i, 0]] - ], dtype=object) + [cv[2, i], 
sources[i], table.metas[2 + i, 0]]], dtype=object) temp_table = self.add_metas(temp_table, temp_d, temp_m) tables.append(temp_table) @@ -182,19 +181,19 @@ def test_no_data(self): class GroupTableIndicesTest(unittest.TestCase): def test_varying_between_combined(self): - X = np.array([[0, 0, 0, 0, 0, 1, ], - [0, 0, 1, 1, 0, 1, ], - [0, 0, 0, 2, np.nan, np.nan, ], - [0, 1, 0, 0, 0, 0, ], - [0, 1, 0, 2, 0, 0, ], - [0, 1, 0, 0, np.nan, 0, ]]) - - M = np.array([["A", 0, 0, 0, 0, 0, 1, ], - ["A", 0, 0, 1, 1, 0, 1, ], - ["A", 0, 0, 0, 2, np.nan, np.nan, ], - ["B", 0, 1, 0, 0, 0, 0, ], - ["B", 0, 1, 0, 2, 0, 0, ], - ["B", 0, 1, 0, 0, np.nan, 0, ]], dtype=str) + X = np.array([[0, 0, 0, 0, 0, 1,], + [0, 0, 1, 1, 0, 1,], + [0, 0, 0, 2, np.nan, np.nan,], + [0, 1, 0, 0, 0, 0,], + [0, 1, 0, 2, 0, 0,], + [0, 1, 0, 0, np.nan, 0,]]) + + M = np.array([["A", 0, 0, 0, 0, 0, 1,], + ["A", 0, 0, 1, 1, 0, 1,], + ["A", 0, 0, 0, 2, np.nan, np.nan,], + ["B", 0, 1, 0, 0, 0, 0,], + ["B", 0, 1, 0, 2, 0, 0,], + ["B", 0, 1, 0, 0, np.nan, 0,]], dtype=str) variables = [ContinuousVariable(name="F%d" % j) for j in range(X.shape[1])] metas = [StringVariable(name="M%d" % j) for j in range(M.shape[1])] From 329f8c42bef9b2436dcf5d367a3f301fec842200 Mon Sep 17 00:00:00 2001 From: Lan Zagar Date: Fri, 2 Jun 2017 09:44:36 +0200 Subject: [PATCH 4/5] owvenndiagram: Remove casting to Table --- Orange/widgets/visualize/owvenndiagram.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Orange/widgets/visualize/owvenndiagram.py b/Orange/widgets/visualize/owvenndiagram.py index a8240dcfcdb..370bdf9f59c 100644 --- a/Orange/widgets/visualize/owvenndiagram.py +++ b/Orange/widgets/visualize/owvenndiagram.py @@ -716,11 +716,7 @@ def table_concat(tables): domain = Orange.data.Domain(attributes, class_vars, metas) tables = list(map(lambda tab: tab.transform(domain), tables)) - - # TODO: remove casting to Table when Corpus.concatenate becomes available - new_table = Orange.data.Table.from_table(domain, 
- Orange.data.Table.concatenate(tables, axis=0)) - return new_table + return tables[0].concatenate(tables, axis=0) def copy_descriptor(descriptor, newname=None): From ec6dcae7f03ef75ada07a94001208400b00811d9 Mon Sep 17 00:00:00 2001 From: Kernc Date: Fri, 2 Jun 2017 14:09:10 +0200 Subject: [PATCH 5/5] Lint --- Orange/widgets/visualize/owvenndiagram.py | 15 +++++++-------- .../widgets/visualize/tests/test_owvenndiagram.py | 3 ++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Orange/widgets/visualize/owvenndiagram.py b/Orange/widgets/visualize/owvenndiagram.py index 370bdf9f59c..70987c09663 100644 --- a/Orange/widgets/visualize/owvenndiagram.py +++ b/Orange/widgets/visualize/owvenndiagram.py @@ -173,9 +173,9 @@ def handleNewSignals(self): self.samedomain = samedomain has_identifiers = all(source_attributes(input.table.domain) - for input in self.data.values()) + for input in self.data.values()) has_any_identifiers = any(source_attributes(input.table.domain) - for input in self.data.values()) + for input in self.data.values()) self.useequalityButton.setEnabled(samedomain) self.useidentifiersButton.setEnabled( has_any_identifiers or len(self.data) == 0) @@ -715,7 +715,7 @@ def table_concat(tables): domain = Orange.data.Domain(attributes, class_vars, metas) - tables = list(map(lambda tab: tab.transform(domain), tables)) + tables = [tab.transform(domain) for tab in tables] return tables[0].concatenate(tables, axis=0) @@ -785,7 +785,7 @@ def inst_key(inst, vars): # each instance in this list belongs to one group (but not all # groups need to be present). 
inst_by_id = defaultdict(list) - id_by_inst = defaultdict(list) # inverse mapping + id_by_inst = defaultdict(list) # inverse mapping for i in range(len(table)): inst_id = instance_ids[i] @@ -836,8 +836,8 @@ def expanded(feat): if var in idvarlist or var in in_expanded: continue col, _ = newtable.get_column_view(var) - nan_indices = filter(lambda j: isinstance(col[j], str) or numpy.isnan(col[j]), - col.nonzero()[0]) + nan_indices = (i for i in col.nonzero()[0] + if isinstance(col[i], str) or numpy.isnan(col[i])) for i in nan_indices: for ind in inst_by_id[ids[i]]: if not numpy.isnan(table[ind, var]): @@ -936,8 +936,7 @@ def string_attributes(domain): """ Return all string attributes from the domain. """ - return [attr for attr in domain.variables + domain.metas - if attr.is_string] + return [attr for attr in domain.variables + domain.metas if attr.is_string] def discrete_attributes(domain): diff --git a/Orange/widgets/visualize/tests/test_owvenndiagram.py b/Orange/widgets/visualize/tests/test_owvenndiagram.py index 8e1355c246c..2ccd5b2640f 100644 --- a/Orange/widgets/visualize/tests/test_owvenndiagram.py +++ b/Orange/widgets/visualize/tests/test_owvenndiagram.py @@ -76,7 +76,8 @@ def test_venn_diagram(self): source_var, item_id_var) temp_m = np.array([[cv[0, i], sources[i], table.metas[0 + i, 0]], [cv[1, i], sources[i], table.metas[1 + i, 0]], - [cv[2, i], sources[i], table.metas[2 + i, 0]]], dtype=object) + [cv[2, i], sources[i], table.metas[2 + i, 0]]], + dtype=object) temp_table = self.add_metas(temp_table, temp_d, temp_m) tables.append(temp_table)