Skip to content

Commit 297d32d

Browse files
authored
Merge pull request #4076 from VesnaT/merge_data
[FIX] Merge data: Rename variables with duplicated names
2 parents 3491de7 + d89d4bf commit 297d32d

File tree

5 files changed

+107
-35
lines changed

5 files changed

+107
-35
lines changed

Orange/data/tests/test_variable.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,23 @@ def test_copy_copies_attributes(self):
6060
# Attributes of original value should not change
6161
self.assertEqual(var.attributes["a"], "b")
6262

63+
def test_rename(self):
64+
var = self.varcls_modified("x")
65+
var2 = var.copy(name="x2")
66+
self.assertIsInstance(var2, type(var))
67+
self.assertIsNot(var, var2)
68+
self.assertEqual(var2.name, "x2")
69+
var.__dict__.pop("_name")
70+
var2.__dict__.pop("_name")
71+
self.assertDictEqual(var.__dict__, var2.__dict__)
72+
73+
def varcls_modified(self, name):
74+
var = self.varcls(name)
75+
var._compute_value = lambda x: x
76+
var.sparse = True
77+
var.attributes = {"a": 1}
78+
return var
79+
6380

6481
class TestVariable(unittest.TestCase):
6582
@classmethod
@@ -378,6 +395,12 @@ def test_mapper_from_no_values(self):
378395
self.assertRaises(ValueError, mapper, sp.csr_matrix(arr), 0)
379396
self.assertRaises(ValueError, mapper, sp.csc_matrix(arr), 0)
380397

398+
def varcls_modified(self, name):
399+
var = super().varcls_modified(name)
400+
var.values = ["A", "B"]
401+
var.ordered = True
402+
return var
403+
381404

382405
@variabletest(ContinuousVariable)
383406
class TestContinuousVariable(VariableTest):
@@ -419,6 +442,11 @@ def test_colors(self):
419442
a.colors = ((3, 2, 1), (6, 5, 4), True)
420443
self.assertEqual(a.colors, ((3, 2, 1), (6, 5, 4), True))
421444

445+
def varcls_modified(self, name):
446+
var = super().varcls_modified(name)
447+
var.number_of_decimals = 5
448+
return var
449+
422450

423451
@variabletest(StringVariable)
424452
class TestStringVariable(VariableTest):
@@ -538,6 +566,13 @@ def test_have_date_have_time_in_construct(self):
538566
self.assertTrue(var.have_date)
539567
self.assertFalse(var.have_time)
540568

569+
def varcls_modified(self, name):
570+
var = super().varcls_modified(name)
571+
var.number_of_decimals = 5
572+
var.have_date = 1
573+
var.have_time = 1
574+
return var
575+
541576

542577
PickleContinuousVariable = create_pickling_tests(
543578
"PickleContinuousVariable",

Orange/data/variable.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import collections
21
import re
32
import warnings
43
from collections import Iterable
@@ -215,7 +214,8 @@ def __contains__(self, other):
215214

216215
def __hash__(self):
217216
if self.variable.is_discrete:
218-
# It is not possible to hash the id and the domain value to the same number as required by __eq__.
217+
# It is not possible to hash the id and the domain value to the
218+
# same number as required by __eq__.
219219
# hash(1) == hash(Value(DiscreteVariable("var", ["red", "green", "blue"]), 1)) == hash("green")
220220
# User should hash directly ids or domain values instead.
221221
raise TypeError("unhashable type - cannot hash values of discrete variables!")
@@ -451,8 +451,10 @@ def __reduce__(self):
451451
# Use make to unpickle variables.
452452
return make_variable, (self.__class__, self._compute_value, self.name), self.__dict__
453453

454-
def copy(self, compute_value):
455-
var = type(self)(self.name, compute_value=compute_value, sparse=self.sparse)
454+
def copy(self, compute_value=None, name=None, **kwargs):
455+
var = type(self)(name=name or self.name,
456+
compute_value=compute_value or self.compute_value,
457+
sparse=self.sparse, **kwargs)
456458
var.attributes = dict(self.attributes)
457459
return var
458460

@@ -507,6 +509,10 @@ def number_of_decimals(self):
507509
def format_str(self):
508510
return self._format_str
509511

512+
@format_str.setter
513+
def format_str(self, value):
514+
self._format_str = value
515+
510516
@property
511517
def colors(self):
512518
if self._colors is None:
@@ -559,9 +565,12 @@ def repr_val(self, val):
559565

560566
str_val = repr_val
561567

562-
def copy(self, compute_value=None):
563-
var = type(self)(self.name, self.number_of_decimals, compute_value, sparse=self.sparse)
564-
var.attributes = dict(self.attributes)
568+
def copy(self, compute_value=None, name=None, **kwargs):
569+
var = super().copy(compute_value=compute_value, name=name,
570+
number_of_decimals=self.number_of_decimals,
571+
**kwargs)
572+
var.adjust_decimals = self.adjust_decimals
573+
var.format_str = self._format_str
565574
return var
566575

567576

@@ -795,11 +804,9 @@ def ordered_values(values):
795804
except ValueError:
796805
return sorted(values)
797806

798-
def copy(self, compute_value=None):
799-
var = DiscreteVariable(self.name, self.values, self.ordered,
800-
compute_value, sparse=self.sparse)
801-
var.attributes = dict(self.attributes)
802-
return var
807+
def copy(self, compute_value=None, name=None, **_):
808+
return super().copy(compute_value=compute_value, name=name,
809+
values=self.values, ordered=self.ordered)
803810

804811

805812
class StringVariable(Variable):
@@ -913,11 +920,9 @@ def __init__(self, *args, have_date=0, have_time=0, **kwargs):
913920
self.have_date = have_date
914921
self.have_time = have_time
915922

916-
def copy(self, compute_value=None):
917-
copy = super().copy(compute_value=compute_value)
918-
copy.have_date = self.have_date
919-
copy.have_time = self.have_time
920-
return copy
923+
def copy(self, compute_value=None, name=None, **_):
924+
return super().copy(compute_value=compute_value, name=name,
925+
have_date=self.have_date, have_time=self.have_time)
921926

922927
@staticmethod
923928
def _tzre_sub(s, _subtz=re.compile(r'([+-])(\d\d):(\d\d)$').sub):

Orange/widgets/data/owmergedata.py

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import Orange
1111
from Orange.data import StringVariable, ContinuousVariable, Variable
12-
from Orange.data.util import hstack
12+
from Orange.data.util import hstack, get_unique_names_duplicates
1313
from Orange.widgets import widget, gui
1414
from Orange.widgets.settings import Setting
1515
from Orange.widgets.utils.itemmodels import DomainModel
@@ -217,13 +217,14 @@ class Outputs:
217217
resizing_enabled = False
218218

219219
class Warning(widget.OWWidget.Warning):
220-
duplicate_names = Msg("Duplicate variable names in output.")
220+
renamed_vars = Msg("Some variables have been renamed "
221+
"to avoid duplicates.\n{}")
221222

222223
class Error(widget.OWWidget.Error):
223224
matching_numeric_with_nonnum = Msg(
224-
"Numeric and non-numeric columns ({} and {}) can't be matched.")
225-
matching_index_with_sth = Msg("Row index cannot by matched with {}.")
226-
matching_id_with_sth = Msg("Instance cannot by matched with {}.")
225+
"Numeric and non-numeric columns ({} and {}) cannot be matched.")
226+
matching_index_with_sth = Msg("Row index cannot be matched with {}.")
227+
matching_id_with_sth = Msg("Instance cannot be matched with {}.")
227228
nonunique_left = Msg(
228229
"Some combinations of values on the left appear in multiple rows.\n"
229230
"For this type of merging, every possible combination of values "
@@ -379,19 +380,9 @@ def dataInfoText(data):
379380
f"{len(data.domain) + len(data.domain.metas)} variables"
380381

381382
def commit(self):
382-
self.Error.clear()
383-
self.Warning.duplicate_names.clear()
384-
if not self.data or not self.extra_data:
385-
merged_data = None
386-
else:
387-
merged_data = self.merge()
388-
if merged_data:
389-
merged_domain = merged_data.domain
390-
var_names = [var.name for var in chain(merged_domain.variables,
391-
merged_domain.metas)]
392-
if len(set(var_names)) != len(var_names):
393-
self.Warning.duplicate_names()
394-
self.Outputs.data.send(merged_data)
383+
self.clear_messages()
384+
merged = None if not self.data or not self.extra_data else self.merge()
385+
self.Outputs.data.send(merged)
395386

396387
def send_report(self):
397388
# pylint: disable=invalid-sequence-index
@@ -544,6 +535,7 @@ def _join_table_by_indices(self, reduced_extra, lefti, righti, rightu):
544535
domain = Orange.data.Domain(
545536
*(getattr(self.data.domain, x) + getattr(reduced_extra.domain, x)
546537
for x in ("attributes", "class_vars", "metas")))
538+
domain = self._domain_rename_duplicates(domain)
547539
X = self._join_array_by_indices(self.data.X, reduced_extra.X, lefti, righti)
548540
Y = self._join_array_by_indices(
549541
np.c_[self.data.Y], np.c_[reduced_extra.Y], lefti, righti)
@@ -566,6 +558,29 @@ def _join_table_by_indices(self, reduced_extra, lefti, righti, rightu):
566558

567559
return table
568560

561+
def _domain_rename_duplicates(self, domain):
562+
"""Check for duplicate variable names in domain. If any, rename
563+
the variables, by replacing them with new ones (names are
564+
appended a number). """
565+
attrs, cvars, metas = [], [], []
566+
n_attrs, n_cvars, n_metas = (len(domain.attributes),
567+
len(domain.class_vars), len(domain.metas))
568+
lists = [attrs] * n_attrs + [cvars] * n_cvars + [metas] * n_metas
569+
570+
variables = domain.variables + domain.metas
571+
proposed_names = [m.name for m in variables]
572+
unique_names = get_unique_names_duplicates(proposed_names)
573+
duplicates = set()
574+
for p_name, u_name, var, c in zip(proposed_names, unique_names,
575+
variables, lists):
576+
if p_name != u_name:
577+
duplicates.add(p_name)
578+
var = var.copy(name=u_name)
579+
c.append(var)
580+
if duplicates:
581+
self.Warning.renamed_vars(", ".join(duplicates))
582+
return Orange.data.Domain(attrs, cvars, metas)
583+
569584
@staticmethod
570585
def _join_array_by_indices(left, right, lefti, righti, string_cols=None):
571586
"""Join (horizontally) two arrays, taking pairs of rows given in indices
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
id state continent
22
d s d
3+
34
1 UK Europe
45
2 Russia Europe
56
3 Mexico America

Orange/widgets/data/tests/test_owmergedata.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,22 @@ def test_invalide_pairs(self):
937937
self.assertFalse(widget.Error.matching_index_with_sth.is_shown())
938938
self.assertTrue(widget.Error.matching_numeric_with_nonnum.is_shown())
939939

940+
def test_duplicate_names(self):
941+
domain = Domain([ContinuousVariable("C1")],
942+
metas=[DiscreteVariable("Feature", values=["A", "B"])])
943+
data = Table(domain, np.array([[1.], [0.]]),
944+
metas=np.array([[1.], [0.]]))
945+
domain = Domain([ContinuousVariable("C1")],
946+
metas=[StringVariable("Feature")])
947+
extra_data = Table(domain, np.array([[1.], [0.]]),
948+
metas=np.array([["A"], ["B"]]))
949+
self.send_signal(self.widget.Inputs.data, data)
950+
self.send_signal(self.widget.Inputs.extra_data, extra_data)
951+
self.assertTrue(self.widget.Warning.renamed_vars.is_shown())
952+
merged_data = self.get_output(self.widget.Outputs.data)
953+
self.assertListEqual([m.name for m in merged_data.domain.metas],
954+
["Feature (1)", "Feature (2)"])
955+
940956

941957
if __name__ == "__main__":
942958
unittest.main()

0 commit comments

Comments
 (0)