Skip to content

Commit 1045e72

Browse files
Merge pull request #4431 from AndrejaKovacic/unique-names
[FIX] Ensure unique var names in file
2 parents 34063e6 + e48a32f commit 1045e72

File tree

6 files changed

+258
-53
lines changed

6 files changed

+258
-53
lines changed

Orange/data/tests/test_util.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,39 @@ def test_get_unique_names_from_duplicates(self):
5959
["x (2)", "x (3)", "x (1)"])
6060
self.assertEqual(
6161
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]),
62-
["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"])
62+
["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"])
63+
self.assertEqual(
64+
get_unique_names_duplicates(["iris", "iris", "iris (1)"]),
65+
["iris (2)", "iris (3)", "iris (1)"])
66+
67+
self.assertEqual(
68+
get_unique_names_duplicates(["foo", "bar", "baz"], return_duplicated=True),
69+
(["foo", "bar", "baz"], []))
70+
self.assertEqual(
71+
get_unique_names_duplicates(["foo", "bar", "baz", "bar"], return_duplicated=True),
72+
(["foo", "bar (1)", "baz", "bar (2)"], ["bar"]))
73+
self.assertEqual(
74+
get_unique_names_duplicates(["x", "x", "x (1)"], return_duplicated=True),
75+
(["x (2)", "x (3)", "x (1)"], ["x"]))
76+
self.assertEqual(
77+
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"], return_duplicated=True),
78+
(["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"], ["x (2)", "x"]))
6379
self.assertEqual(
6480
get_unique_names_duplicates(["x", "", "", None, None, "x"]),
6581
["x (1)", "", "", None, None, "x (2)"])
82+
self.assertEqual(
83+
get_unique_names_duplicates(["iris", "iris", "iris (1)", "iris (2)"], return_duplicated=True),
84+
(["iris (3)", "iris (4)", "iris (1)", "iris (2)"], ["iris"]))
85+
86+
self.assertEqual(
87+
get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)"]),
88+
["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"]
89+
)
90+
91+
self.assertEqual(
92+
get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"]),
93+
["iris (1) (1)", "iris (1) (2)", "iris (1) (3)", "iris (2)", "iris (3)"]
94+
)
6695

6796
def test_get_unique_names_domain(self):
6897
(attrs, classes, metas), renamed = \

Orange/data/util.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Data-manipulation utilities.
33
"""
44
import re
5-
from collections import Counter, defaultdict
6-
from itertools import chain
5+
from collections import Counter
6+
from itertools import chain, count
77
from typing import Callable
88

99
import numpy as np
@@ -155,8 +155,8 @@ def get_indices(names, name):
155155
:param name: str
156156
:return: list of indices
157157
"""
158-
return [int(a.group(2)) for x in names
159-
for a in re.finditer(RE_FIND_INDEX.format(name), x)]
158+
return [int(a.group(2)) for x in filter(None, names)
159+
for a in re.finditer(RE_FIND_INDEX.format(re.escape(name)), x)]
160160

161161

162162
def get_unique_names(names, proposed):
@@ -203,26 +203,22 @@ def get_unique_names(names, proposed):
203203
return [f"{name} ({max_index})" for name in proposed]
204204

205205

206-
def get_unique_names_duplicates(proposed: list, return_duplicated=False) -> list:
    """
    Make the names in `proposed` unique by numbering the repeated ones.

    Every non-empty name that occurs more than once gets an index in
    parentheses appended to each of its occurrences. Numbering for a name
    starts one above the largest index that `get_indices(proposed, name)`
    reports, or at 1 if it reports none. Names that occur only once, and
    falsy names such as `""` or `None`, are returned unchanged.

    For example, a proposed list of names `x`, `x` and `x (2)`
    results in `x (3)`, `x (4)`, `x (2)`.

    :param proposed: list of proposed names
    :param return_duplicated: if True, also return the duplicated names
    :return: list of unique names; if `return_duplicated` is True, a tuple
        with the list of unique names and the list of names that were
        duplicated, in order of their first occurrence
    """
    # One independent counter per duplicated name.
    counters = {}
    for name, occurrences in Counter(proposed).items():
        if not name or occurrences <= 1:
            continue
        largest_used = max(get_indices(proposed, name), default=0)
        counters[name] = count(largest_used + 1)

    unique_names = []
    for name in proposed:
        if name in counters:
            unique_names.append(f"{name} ({next(counters[name])})")
        else:
            unique_names.append(name)

    if not return_duplicated:
        return unique_names
    return unique_names, list(counters)
226222

227223

228224
def get_unique_names_domain(attributes, class_vars=(), metas=()):

Orange/widgets/data/owfile.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from Orange.widgets.utils.filedialogs import RecentPathsWComboMixin, \
2222
open_filename_dialog
2323
from Orange.widgets.utils.widgetpreview import WidgetPreview
24-
from Orange.widgets.widget import Output
24+
from Orange.widgets.widget import Output, Msg
2525

2626
# Backward compatibility: class RecentPath used to be defined in this module,
2727
# and it is used in saved (pickled) settings. It must be imported into the
@@ -121,17 +121,19 @@ class Outputs:
121121
domain_editor = SettingProvider(DomainEditor)
122122

123123
class Warning(widget.OWWidget.Warning):
124-
file_too_big = widget.Msg("The file is too large to load automatically."
125-
" Press Reload to load.")
126-
load_warning = widget.Msg("Read warning:\n{}")
127-
performance_warning = widget.Msg(
124+
file_too_big = Msg("The file is too large to load automatically."
125+
" Press Reload to load.")
126+
load_warning = Msg("Read warning:\n{}")
127+
performance_warning = Msg(
128128
"Categorical variables with >100 values may decrease performance.")
129+
renamed_vars = Msg("Some variables have been renamed "
130+
"to avoid duplicates.\n{}")
129131

130132
class Error(widget.OWWidget.Error):
131-
file_not_found = widget.Msg("File not found.")
132-
missing_reader = widget.Msg("Missing reader.")
133-
sheet_error = widget.Msg("Error listing available sheets.")
134-
unknown = widget.Msg("Read error:\n{}")
133+
file_not_found = Msg("File not found.")
134+
missing_reader = Msg("Missing reader.")
135+
sheet_error = Msg("Error listing available sheets.")
136+
unknown = Msg("Read error:\n{}")
135137

136138
class NoFileSelected:
137139
pass
@@ -478,10 +480,13 @@ def _inspect_discrete_variables(self, domain):
478480

479481
def apply_domain_edit(self):
480482
self.Warning.performance_warning.clear()
483+
self.Warning.renamed_vars.clear()
481484
if self.data is None:
482485
table = None
483486
else:
484-
domain, cols = self.domain_editor.get_domain(self.data.domain, self.data)
487+
domain, cols, renamed = \
488+
self.domain_editor.get_domain(self.data.domain, self.data,
489+
deduplicate=True)
485490
if not (domain.variables or domain.metas):
486491
table = None
487492
elif domain is self.data.domain:
@@ -493,6 +498,8 @@ def apply_domain_edit(self):
493498
table.ids = np.array(self.data.ids)
494499
table.attributes = getattr(self.data, 'attributes', {})
495500
self._inspect_discrete_variables(domain)
501+
if renamed:
502+
self.Warning.renamed_vars(f"Renamed: {', '.join(renamed)}")
496503

497504
self.Outputs.data.send(table)
498505
self.apply_button.setEnabled(False)

Orange/widgets/data/tests/test_owfile.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,22 @@ def test_domain_changes_are_stored(self):
141141
data = self.get_output(self.widget.Outputs.data)
142142
self.assertIsInstance(data.domain["iris"], StringVariable)
143143

144+
def test_rename_duplicates(self):
    """
    Check that duplicated variable names are renamed and reported.

    Giving a variable a name that is already used must produce indexed
    names in the output domain and show the `renamed_vars` warning; the
    warning must disappear once the name is unique again.
    """
    self.open_dataset("iris")
    model = self.widget.domain_editor.model()
    name_cell = model.createIndex(3, 0)

    # Nothing has been renamed yet.
    self.assertFalse(self.widget.Warning.renamed_vars.is_shown())

    # Introduce a second variable called "iris".
    model.setData(name_cell, "iris", Qt.EditRole)
    self.widget.apply_button.click()
    data = self.get_output(self.widget.Outputs.data)
    for expected_name in ("iris (1)", "iris (2)"):
        self.assertIn(expected_name, data.domain)
    self.assertTrue(self.widget.Warning.renamed_vars.is_shown())

    # Resolve the clash; the warning must be cleared.
    model.setData(name_cell, "different iris", Qt.EditRole)
    self.widget.apply_button.click()
    self.assertFalse(self.widget.Warning.renamed_vars.is_shown())
159+
144160
def test_variable_name_change(self):
145161
"""
146162
Test whether the name of the variable is changed correctly by
@@ -155,6 +171,12 @@ def test_variable_name_change(self):
155171
data = self.get_output(self.widget.Outputs.data)
156172
self.assertIn("a", data.domain)
157173

174+
idx = self.widget.domain_editor.model().createIndex(3, 0)
175+
self.widget.domain_editor.model().setData(idx, "d", Qt.EditRole)
176+
self.widget.apply_button.click()
177+
data = self.get_output(self.widget.Outputs.data)
178+
self.assertIn("d", data.domain)
179+
158180
# rename and change to text
159181
idx = self.widget.domain_editor.model().createIndex(4, 0)
160182
self.widget.domain_editor.model().setData(idx, "b", Qt.EditRole)
@@ -250,6 +272,13 @@ def test_check_column_noname(self):
250272
self.widget.domain_editor.model().setData(idx, "", Qt.EditRole)
251273
self.assertEqual(self.widget.domain_editor.model().data(idx, Qt.DisplayRole), temp)
252274

275+
def test_invalid_role_mode(self):
    """
    Check that the model rejects a role it does not handle.

    `setData` must return False and `data` must return None when called
    with `Qt.StatusTipRole`.
    """
    self.open_dataset("iris")
    model = self.widget.domain_editor.model()
    idx = model.createIndex(1, 0)
    # Argument order is (index, value, role). Passing the role in the value
    # slot would only pass because "" is not Qt.EditRole.
    self.assertFalse(model.setData(idx, "", Qt.StatusTipRole))
    self.assertIsNone(model.data(idx, Qt.StatusTipRole))
281+
253282
def test_context_match_includes_variable_values(self):
254283
file1 = """\
255284
var

Orange/widgets/utils/domaineditor.py

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
1212
TimeVariable, Domain
13+
from Orange.data.util import get_unique_names_duplicates
1314
from Orange.statistics.util import unique
1415
from Orange.widgets import gui
1516
from Orange.widgets.gui import HorizontalGridDelegate
@@ -61,13 +62,14 @@ def set_variables(self, variables):
6162
def rowCount(self, parent):
    """
    Return the number of rows under `parent`.

    A valid parent index has no rows; otherwise there is one row per
    entry in `self.variables`.
    """
    if parent.isValid():
        return 0
    return len(self.variables)
6364

64-
@staticmethod
def columnCount(parent):
    """
    Return the number of columns under `parent`.

    A valid parent index has no columns; otherwise the count is
    `Column.not_valid`.
    """
    if parent.isValid():
        return 0
    return Column.not_valid
6668

6769
def data(self, index, role):
6870
row, col = index.row(), index.column()
6971
val = self.variables[row][col]
70-
if role == Qt.DisplayRole or role == Qt.EditRole:
72+
if role in (Qt.DisplayRole, Qt.EditRole):
7173
if col == Column.tpe:
7274
return self.type2name[val]
7375
if col == Column.place:
@@ -90,8 +92,9 @@ def data(self, index, role):
9092
font = QFont()
9193
font.setBold(True)
9294
return font
95+
return None
9396

94-
def setData(self, index, value, role):
97+
def setData(self, index, value, role=Qt.EditRole):
9598
row, col = index.row(), index.column()
9699
row_data = self.variables[row]
97100
if role == Qt.EditRole:
@@ -110,6 +113,7 @@ def setData(self, index, value, role):
110113
# Settings may change background colors
111114
self.dataChanged.emit(index.sibling(row, 0), index.sibling(row, 3))
112115
return True
116+
return False
113117

114118
def headerData(self, i, orientation, role=Qt.DisplayRole):
115119
if orientation == Qt.Horizontal and role == Qt.DisplayRole and i < 4:
@@ -130,7 +134,7 @@ def __init__(self, view, items):
130134
self.view = view
131135
self.items = items
132136

133-
def createEditor(self, parent, option, index):
137+
def createEditor(self, parent, _option, index):
134138
# This ugly hack closes the combo when the user selects an item
135139
class Combo(QComboBox):
136140
def __init__(self, *args):
@@ -145,6 +149,8 @@ def showPopup(self, *args):
145149
super().showPopup(*args)
146150
self.popup_shown = True
147151

152+
# Here, we need `self` from the closure
153+
# pylint: disable=no-self-argument,attribute-defined-outside-init
148154
def hidePopup(me):
149155
if me.popup_shown:
150156
self.view.model().setData(
@@ -250,21 +256,27 @@ def _merge(cols, force_dense=False):
250256
sparse_cols = [c if sp.issparse(c) else sp.csc_matrix(c) for c in cols]
251257
return sp.hstack(sparse_cols).tocsr()
252258

253-
def get_domain(self, domain, data):
254-
"""Create domain (and dataset) from changes made in the widget.
255-
256-
Parameters
257-
----------
258-
domain : old domain
259-
data : source data
259+
def get_domain(self, domain, data, deduplicate=False):
260+
"""
261+
Create domain (and dataset) from changes made in the widget.
260262
261263
Returns
262264
-------
263-
(new_domain, [attribute_columns, class_var_columns, meta_columns])
265+
266+
Args:
267+
domain (Domain): original domain
268+
data (Table): original data
269+
deduplicate (bool): if True, variable names are deduplicated and
270+
the result contains an additional list with names of renamed
271+
variables
272+
273+
Returns:
274+
(new_domain, [attribute_columns, class_var_columns, meta_columns])
275+
or
276+
(new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
264277
"""
265278
# Allow type-checking with type() instead of isinstance() for exact comparison
266279
# pylint: disable=unidiomatic-typecheck
267-
268280
variables = self.model().variables
269281
places = [[], [], []] # attributes, class_vars, metas
270282
cols = [[], [], []] # Xcols, Ycols, Mcols
@@ -283,8 +295,17 @@ def numbers_are_round(var, col_data):
283295
chain(((at, Place.feature) for at in domain.attributes),
284296
((cl, Place.class_var) for cl in domain.class_vars),
285297
((mt, Place.meta) for mt in domain.metas)))):
286-
return domain, [data.X, data.Y, data.metas]
298+
if deduplicate:
299+
return domain, [data.X, data.Y, data.metas], []
300+
else:
301+
return domain, [data.X, data.Y, data.metas]
287302

303+
relevant_names = [var[0] for var in variables if var[2] != Place.skip]
304+
if deduplicate:
305+
renamed_iter = iter(get_unique_names_duplicates(relevant_names))
306+
else:
307+
renamed_iter = iter(relevant_names)
308+
renamed = []
288309
for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
289310
zip(variables,
290311
chain([(at, Place.feature) for at in domain.attributes],
@@ -293,24 +314,28 @@ def numbers_are_round(var, col_data):
293314
if place == Place.skip:
294315
continue
295316

317+
new_name = next(renamed_iter)
318+
if new_name != name and name not in renamed:
319+
renamed.append(name)
320+
296321
col_data = self._get_column(data, orig_var, orig_plc)
297322
is_sparse = sp.issparse(col_data)
298323

299-
if name == orig_var.name and tpe == type(orig_var):
324+
if new_name == orig_var.name and tpe == type(orig_var):
300325
var = orig_var
301326
elif tpe == type(orig_var):
302-
var = orig_var.copy(name=name)
327+
var = orig_var.copy(name=new_name)
303328
elif tpe == DiscreteVariable:
304329
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
305330
round_numbers = numbers_are_round(orig_var, col_data)
306331
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
307332
for x in self._iter_vals(col_data)]
308333
if round_numbers:
309334
values = [str(int(float(v))) for v in values]
310-
var = tpe(name, values)
335+
var = tpe(new_name, values)
311336
col_data = self._to_column(col_data, is_sparse)
312337
elif tpe == StringVariable:
313-
var = tpe.make(name)
338+
var = tpe.make(new_name)
314339
if type(orig_var) in [DiscreteVariable, TimeVariable]:
315340
col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
316341
for x in self._iter_vals(col_data)]
@@ -324,25 +349,29 @@ def numbers_are_round(var, col_data):
324349
# in metas which are transformed to dense below
325350
col_data = self._to_column(col_data, False, dtype=object)
326351
elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
327-
var = tpe.make(name)
352+
var = tpe.make(new_name)
328353
if may_be_numeric:
329354
col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)])
330355
for x in self._iter_vals(col_data)]
331356
col_data = self._to_column(col_data, is_sparse)
332357
else:
333-
var = tpe(name)
358+
var = tpe(new_name)
334359
places[place].append(var)
335360
cols[place].append(col_data)
336361

337362
# merge columns for X, Y and metas
338363
feats = cols[Place.feature]
339-
X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
364+
X = self._merge(feats) if feats else np.empty((len(data), 0))
340365
Y = self._merge(cols[Place.class_var], force_dense=True)
341366
m = self._merge(cols[Place.meta], force_dense=True)
342367
domain = Domain(*places)
343-
return domain, [X, Y, m]
368+
if deduplicate:
369+
return domain, [X, Y, m], renamed
370+
else:
371+
return domain, [X, Y, m]
344372

345-
def _get_column(self, data, source_var, source_place):
373+
@staticmethod
374+
def _get_column(data, source_var, source_place):
346375
""" Extract column from data and preserve sparsity. """
347376
if source_place == Place.meta:
348377
col_data = data[:, source_var].metas

0 commit comments

Comments
 (0)