Skip to content

Commit 525e8a6

Browse files
authored
improve error message for dataset upload (#927)
* improve error message for dataset upload * fix unit test
1 parent 2bfd581 commit 525e8a6

File tree

2 files changed

+33
-9
lines changed

2 files changed

+33
-9
lines changed

openml/datasets/dataset.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,16 +133,40 @@ def __init__(
133133
qualities=None,
134134
dataset=None,
135135
):
136+
def find_invalid_characters(string, pattern):
    """Return the characters of ``string`` that ``pattern`` rejects.

    Every character is matched on its own against ``pattern``; the
    characters for which the match fails are collected, de-duplicated and
    rendered for use in an error message.

    Parameters
    ----------
    string : str
        The text to inspect.
    pattern : str
        Regular expression that a valid string has to match.

    Returns
    -------
    str
        Comma-separated list of the distinct offending characters. Each
        character is wrapped in single quotes, except the single quote
        itself, which is wrapped in double quotes to stay readable. The
        result is an empty string if no character is rejected.
    """
    regex = re.compile(pattern)
    # The set drops duplicates; sorting makes the resulting message
    # reproducible, since set iteration order is not guaranteed.
    invalid_chars = sorted({char for char in string if not regex.match(char)})
    return ",".join(
        '"{}"'.format(char) if char == "'" else "'{}'".format(char)
        for char in invalid_chars
    )
149+
136150
if dataset_id is None:
137-
if description and not re.match("^[\x00-\x7F]*$", description):
151+
pattern = "^[\x00-\x7F]*$"
152+
if description and not re.match(pattern, description):
138153
# not basiclatin (XSD complains)
139-
raise ValueError("Invalid symbols in description: {}".format(description))
140-
if citation and not re.match("^[\x00-\x7F]*$", citation):
154+
invalid_characters = find_invalid_characters(description, pattern)
155+
raise ValueError(
156+
"Invalid symbols {} in description: {}".format(invalid_characters, description)
157+
)
158+
pattern = "^[\x00-\x7F]*$"
159+
if citation and not re.match(pattern, citation):
141160
# not basiclatin (XSD complains)
142-
raise ValueError("Invalid symbols in citation: {}".format(citation))
143-
if not re.match("^[a-zA-Z0-9_\\-\\.\\(\\),]+$", name):
161+
invalid_characters = find_invalid_characters(citation, pattern)
162+
raise ValueError(
163+
"Invalid symbols {} in citation: {}".format(invalid_characters, citation)
164+
)
165+
pattern = "^[a-zA-Z0-9_\\-\\.\\(\\),]+$"
166+
if not re.match(pattern, name):
144167
# regex given by server in error message
145-
raise ValueError("Invalid symbols in name: {}".format(name))
168+
invalid_characters = find_invalid_characters(name, pattern)
169+
raise ValueError("Invalid symbols {} in name: {}".format(invalid_characters, name))
146170
# TODO add function to check if the name is casual_string128
147171
# Attributes received by querying the RESTful API
148172
self.dataset_id = int(dataset_id) if dataset_id is not None else None

tests/test_datasets/test_dataset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ def test_repr(self):
3838
str(data)
3939

4040
def test_init_string_validation(self):
41-
with pytest.raises(ValueError, match="Invalid symbols in name"):
41+
with pytest.raises(ValueError, match="Invalid symbols ' ' in name"):
4242
openml.datasets.OpenMLDataset(name="some name", description="a description")
4343

44-
with pytest.raises(ValueError, match="Invalid symbols in description"):
44+
with pytest.raises(ValueError, match="Invalid symbols 'ï' in description"):
4545
openml.datasets.OpenMLDataset(name="somename", description="a descriptïon")
4646

47-
with pytest.raises(ValueError, match="Invalid symbols in citation"):
47+
with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"):
4848
openml.datasets.OpenMLDataset(
4949
name="somename", description="a description", citation="Something by Müller"
5050
)

0 commit comments

Comments
 (0)