Skip to content

Commit 58c4218

Browse files
authored
Merge pull request #276 from openml/data_qualities
Data qualities
2 parents 500e68c + 33a34e1 commit 58c4218

File tree

4 files changed

+114
-6
lines changed

4 files changed

+114
-6
lines changed

openml/datasets/dataset.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
3939
row_id_attribute=None, ignore_attribute=None,
4040
version_label=None, citation=None, tag=None, visibility=None,
4141
original_data_url=None, paper_url=None, update_comment=None,
42-
md5_checksum=None, data_file=None, features=None):
42+
md5_checksum=None, data_file=None, features=None, qualities=None):
4343
# Attributes received by querying the RESTful API
4444
self.dataset_id = int(dataset_id) if dataset_id is not None else None
4545
self.name = name
@@ -74,6 +74,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7474
self.md5_cheksum = md5_checksum
7575
self.data_file = data_file
7676
self.features = None
77+
self.qualities = None
7778

7879
if features is not None:
7980
self.features = {}
@@ -87,6 +88,12 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
8788
raise ValueError('Data features not provided in right order')
8889
self.features[feature.index] = feature
8990

91+
if qualities is not None:
92+
self.qualities = {}
93+
for idx, xmlquality in enumerate(qualities['oml:quality']):
94+
name = xmlquality['oml:name']
95+
value = xmlquality['oml:value']
96+
self.qualities[name] = value
9097

9198
if data_file is not None:
9299
if self._data_features_supported():

openml/datasets/functions.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def _get_cached_dataset(dataset_id):
7575
description = _get_cached_dataset_description(dataset_id)
7676
arff_file = _get_cached_dataset_arff(dataset_id)
7777
features = _get_cached_dataset_features(dataset_id)
78-
dataset = _create_dataset_from_description(description, features, arff_file)
78+
qualities = _get_cached_dataset_qualities(dataset_id)
79+
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
7980

8081
return dataset
8182

@@ -107,6 +108,19 @@ def _get_cached_dataset_features(dataset_id):
107108
"cached" % dataset_id)
108109

109110

111+
def _get_cached_dataset_qualities(dataset_id):
    """Read the cached qualities of a dataset from its ``qualities.xml``.

    The file is looked up at ``<cache dir>/datasets/<dataset_id>/qualities.xml``,
    where the cache directory is the one reported by
    ``config.get_cache_directory()``.

    Parameters
    ----------
    dataset_id : int
        Id of the dataset whose cached qualities should be loaded. It is
        formatted with ``%d`` in the error message below.

    Returns
    -------
    The value stored under the ``"oml:data_qualities"`` key of the parsed
    XML document, as produced by ``xmltodict.parse``.

    Raises
    ------
    OpenMLCacheException
        If reading the qualities file fails with an ``IOError`` or
        ``OSError`` (e.g. the file is not in the cache).
    """
    xml_path = os.path.join(
        config.get_cache_directory(), "datasets", str(dataset_id),
        "qualities.xml")
    try:
        with io.open(xml_path, encoding='utf8') as xml_handle:
            parsed = xmltodict.parse(xml_handle.read())
            return parsed["oml:data_qualities"]
    except (IOError, OSError):
        # An unreadable or missing file is reported as "not cached".
        raise OpenMLCacheException("Dataset qualities for dataset id %d not "
                                   "cached" % dataset_id)
122+
123+
110124
def _get_cached_dataset_arff(dataset_id):
111125
cache_dir = config.get_cache_directory()
112126
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
@@ -272,7 +286,7 @@ def get_dataset(dataset_id):
272286
_remove_dataset_cache_dir(did_cache_dir)
273287
raise e
274288

275-
dataset = _create_dataset_from_description(description, features, arff_file)
289+
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
276290
return dataset
277291

278292

@@ -470,7 +484,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
470484
'Please do this manually!' % did_cache_dir)
471485

472486

473-
def _create_dataset_from_description(description, features, arff_file):
487+
def _create_dataset_from_description(description, features, qualities, arff_file):
474488
"""Create a dataset object from a description dict.
475489
476490
Parameters
@@ -510,5 +524,6 @@ def _create_dataset_from_description(description, features, arff_file):
510524
description.get("oml:update_comment"),
511525
description.get("oml:md5_checksum"),
512526
data_file=arff_file,
513-
features=features)
527+
features=features,
528+
qualities=qualities)
514529
return dataset
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<oml:data_qualities xmlns:oml="http://openml.org/openml">
3+
<oml:quality>
4+
<oml:name>DefaultAccuracy</oml:name>
5+
<oml:value>0.5</oml:value>
6+
</oml:quality>
7+
<oml:quality>
8+
<oml:name>Dimensionality</oml:name>
9+
<oml:value>33.335</oml:value>
10+
</oml:quality>
11+
<oml:quality>
12+
<oml:name>MajorityClassPercentage</oml:name>
13+
<oml:value>50.0</oml:value>
14+
</oml:quality>
15+
<oml:quality>
16+
<oml:name>MajorityClassSize</oml:name>
17+
<oml:value>300.0</oml:value>
18+
</oml:quality>
19+
<oml:quality>
20+
<oml:name>MinorityClassPerentage</oml:name>
21+
<oml:value>50.0</oml:value>
22+
</oml:quality>
23+
<oml:quality>
24+
<oml:name>MinorityClassSize</oml:name>
25+
<oml:value>300.0</oml:value>
26+
</oml:quality>
27+
<oml:quality>
28+
<oml:name>NumberOfBinaryFeatures</oml:name>
29+
<oml:value>1.0</oml:value>
30+
</oml:quality>
31+
<oml:quality>
32+
<oml:name>NumberOfClasses</oml:name>
33+
<oml:value>2.0</oml:value>
34+
</oml:quality>
35+
<oml:quality>
36+
<oml:name>NumberOfFeatures</oml:name>
37+
<oml:value>20001.0</oml:value>
38+
</oml:quality>
39+
<oml:quality>
40+
<oml:name>NumberOfInstances</oml:name>
41+
<oml:value>600.0</oml:value>
42+
</oml:quality>
43+
<oml:quality>
44+
<oml:name>NumberOfInstancesWithMissingValues</oml:name>
45+
<oml:value>0.0</oml:value>
46+
</oml:quality>
47+
<oml:quality>
48+
<oml:name>NumberOfMissingValues</oml:name>
49+
<oml:value>0.0</oml:value>
50+
</oml:quality>
51+
<oml:quality>
52+
<oml:name>NumberOfNumericFeatures</oml:name>
53+
<oml:value>20000.0</oml:value>
54+
</oml:quality>
55+
<oml:quality>
56+
<oml:name>NumberOfSymbolicFeatures</oml:name>
57+
<oml:value>1.0</oml:value>
58+
</oml:quality>
59+
<oml:quality>
60+
<oml:name>PercentageOfBinaryFeatures</oml:name>
61+
<oml:value>0.004999750012499375</oml:value>
62+
</oml:quality>
63+
<oml:quality>
64+
<oml:name>PercentageOfInstancesWithMissingValues</oml:name>
65+
<oml:value>0.0</oml:value>
66+
</oml:quality>
67+
<oml:quality>
68+
<oml:name>PercentageOfMissingValues</oml:name>
69+
<oml:value>0.0</oml:value>
70+
</oml:quality>
71+
<oml:quality>
72+
<oml:name>PercentageOfNumericFeatures</oml:name>
73+
<oml:value>99.9950002499875</oml:value>
74+
</oml:quality>
75+
<oml:quality>
76+
<oml:name>PercentageOfSymbolicFeatures</oml:name>
77+
<oml:value>0.004999750012499375</oml:value>
78+
</oml:quality>
79+
</oml:data_qualities>
80+

tests/test_datasets/test_dataset_functions.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from openml.datasets.functions import (_get_cached_dataset,
2020
_get_cached_dataset_features,
21+
_get_cached_dataset_qualities,
2122
_get_cached_datasets,
2223
_get_dataset_description,
2324
_get_dataset_arff,
@@ -63,11 +64,13 @@ def test__get_cached_dataset(self, ):
6364
openml.config.set_cache_directory(self.static_cache_dir)
6465
dataset = _get_cached_dataset(2)
6566
features = _get_cached_dataset_features(2)
67+
qualities = _get_cached_dataset_qualities(2)
6668
self.assertIsInstance(dataset, OpenMLDataset)
6769
self.assertTrue(len(dataset.features) > 0)
6870
self.assertTrue(len(dataset.features) == len(features['oml:feature']))
71+
self.assertTrue(len(dataset.qualities) == len(qualities['oml:quality']))
6972

70-
def test_get_chached_dataset_description(self):
73+
def test_get_cached_dataset_description(self):
7174
openml.config.set_cache_directory(self.static_cache_dir)
7275
description = openml.datasets.functions._get_cached_dataset_description(2)
7376
self.assertIsInstance(description, dict)
@@ -169,6 +172,9 @@ def test_get_dataset(self):
169172
self.assertTrue(os.path.exists(os.path.join(
170173
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
171174

175+
self.assertGreater(len(dataset.features), 1)
176+
self.assertGreater(len(dataset.qualities), 4)
177+
172178
def test_get_dataset_with_string(self):
173179
dataset = openml.datasets.get_dataset(101)
174180
self.assertRaises(PyOpenMLError, dataset._get_arff, 'arff')

0 commit comments

Comments
 (0)