Skip to content

Commit 87ad7b1

Browse files
authored
Have different cache directories for different servers (#432)
* Have different directories for different servers * simplify based on jans comments * fix attribute access * Change variable name * Take into account Jans suggestions * Harmonize import, fix rebase errors * re-add accidentally removed files * First try at a solution * Removing faulty fix * Fix for bug in unit test, method _get_cached_task * Removing FileNotFoundError as it does not exist in python2 * Fixing test_tagging * Changing id according to new solution * Change _remove_dataset_cache_dir to the new implementation
1 parent c626bde commit 87ad7b1

35 files changed

+240
-233
lines changed

doc/usage.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ API:
5555
.. code:: python
5656
5757
>>> import os
58-
>>> openml.config.set_cache_directory(os.path.expanduser('~/.openml/cache'))
58+
>>> openml.config.cache_directory = os.path.expanduser('~/.openml/cache')
5959
6060
Config file:
6161

openml/config.py

Lines changed: 30 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,31 @@
66

77
from six import StringIO
88
from six.moves import configparser
9+
from six.moves.urllib_parse import urlparse
910

1011

1112
logger = logging.getLogger(__name__)
1213
logging.basicConfig(
1314
format='[%(levelname)s] [%(asctime)s:%(name)s] %('
1415
'message)s', datefmt='%H:%M:%S')
1516

17+
# Default values!
18+
_defaults = {
19+
'apikey': None,
20+
'server': "https://www.openml.org/api/v1/xml",
21+
'verbosity': 0,
22+
'cachedir': os.path.expanduser('~/.openml/cache'),
23+
'avoid_duplicate_runs': 'True',
24+
}
25+
1626
config_file = os.path.expanduser('~/.openml/config')
17-
server = "https://www.openml.org/api/v1/xml"
27+
28+
# Default values are actually added here in the _setup() function which is
29+
# called at the end of this module
30+
server = ""
1831
apikey = ""
19-
cachedir = ""
32+
# The current cache directory (without the server name)
33+
cache_directory = ""
2034

2135

2236
def _setup():
@@ -26,12 +40,11 @@ def _setup():
2640
key and server can be set by the user simply using
2741
openml.config.apikey = THEIRKEY
2842
openml.config.server = SOMESERVER
29-
The cache dir needs to be set up calling set_cache_directory
30-
because it needs some setup.
3143
We could also make it a property but that's less clear.
3244
"""
3345
global apikey
3446
global server
47+
global cache_directory
3548
global avoid_duplicate_runs
3649
# read config file, create cache directory
3750
try:
@@ -42,52 +55,15 @@ def _setup():
4255
config = _parse_config()
4356
apikey = config.get('FAKE_SECTION', 'apikey')
4457
server = config.get('FAKE_SECTION', 'server')
45-
cache_dir = config.get('FAKE_SECTION', 'cachedir')
58+
cache_directory = config.get('FAKE_SECTION', 'cachedir')
4659
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
47-
set_cache_directory(cache_dir)
48-
49-
50-
def set_cache_directory(cachedir):
51-
"""Set module-wide cache directory.
52-
53-
Sets the cache directory into which to download datasets, tasks etc.
54-
55-
Parameters
56-
----------
57-
cachedir : string
58-
Path to use as cache directory.
59-
60-
See also
61-
--------
62-
get_cache_directory
63-
"""
64-
65-
global _cachedir
66-
_cachedir = cachedir
67-
68-
# Set up the cache directories
69-
dataset_cache_dir = os.path.join(cachedir, "datasets")
70-
task_cache_dir = os.path.join(cachedir, "tasks")
71-
run_cache_dir = os.path.join(cachedir, 'runs')
72-
lock_dir = os.path.join(cachedir, 'locks')
73-
74-
for dir_ in [
75-
cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir, lock_dir,
76-
]:
77-
if not os.path.exists(dir_) and not os.path.isdir(dir_):
78-
os.mkdir(dir_)
7960

8061

8162
def _parse_config():
8263
"""Parse the config file, set up defaults.
8364
"""
84-
defaults = {'apikey': apikey,
85-
'server': server,
86-
'verbosity': 0,
87-
'cachedir': os.path.expanduser('~/.openml/cache'),
88-
'avoid_duplicate_runs': 'True'}
8965

90-
config = configparser.RawConfigParser(defaults=defaults)
66+
config = configparser.RawConfigParser(defaults=_defaults)
9167

9268
if not os.path.exists(config_file):
9369
# Create an empty config file if there was none so far
@@ -106,8 +82,7 @@ def _parse_config():
10682
config_file_.seek(0)
10783
config.readfp(config_file_)
10884
except OSError as e:
109-
logging.info("Error opening file %s: %s" %
110-
config_file, e.message)
85+
logging.info("Error opening file %s: %s", config_file, e.message)
11186
return config
11287

11388

@@ -119,13 +94,19 @@ def get_cache_directory():
11994
cachedir : string
12095
The current cache directory.
12196
122-
See also
123-
--------
124-
set_cache_directory
12597
"""
98+
url_suffix = urlparse(server).netloc
99+
reversed_url_suffix = '/'.join(url_suffix.split('.')[::-1])
100+
if not cache_directory:
101+
_cachedir = _defaults(cache_directory)
102+
else:
103+
_cachedir = cache_directory
104+
_cachedir = os.path.join(_cachedir, reversed_url_suffix)
126105
return _cachedir
127106

128107

129-
__all__ = ["set_cache_directory", 'get_cache_directory']
108+
__all__ = [
109+
'get_cache_directory',
110+
]
130111

131112
_setup()

openml/datasets/functions.py

Lines changed: 30 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,22 @@
1414
from .dataset import OpenMLDataset
1515
from ..exceptions import OpenMLCacheException, OpenMLServerException, \
1616
OpenMLHashException, PrivateDatasetError
17-
from .. import config
18-
from .._api_calls import _read_url
17+
from ..utils import (
18+
_create_cache_directory,
19+
_remove_cache_dir_for_id,
20+
_create_cache_directory_for_id,
21+
_create_lockfiles_dir,
22+
)
23+
24+
25+
DATASETS_CACHE_DIR_NAME = 'datasets'
26+
1927

2028

2129
############################################################################
2230
# Local getters/accessors to the cache directory
2331

32+
2433
def _list_cached_datasets():
2534
"""Return list with ids of all cached datasets
2635
@@ -31,8 +40,7 @@ def _list_cached_datasets():
3140
"""
3241
datasets = []
3342

34-
dataset_cache = config.get_cache_directory()
35-
dataset_cache_dir = os.path.join(dataset_cache, "datasets")
43+
dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME)
3644
directory_content = os.listdir(dataset_cache_dir)
3745
directory_content.sort()
3846

@@ -88,8 +96,9 @@ def _get_cached_dataset(dataset_id):
8896

8997

9098
def _get_cached_dataset_description(dataset_id):
91-
cache_dir = config.get_cache_directory()
92-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
99+
did_cache_dir = _create_cache_directory_for_id(
100+
DATASETS_CACHE_DIR_NAME, dataset_id,
101+
)
93102
description_file = os.path.join(did_cache_dir, "description.xml")
94103
try:
95104
with io.open(description_file, encoding='utf8') as fh:
@@ -102,8 +111,9 @@ def _get_cached_dataset_description(dataset_id):
102111

103112

104113
def _get_cached_dataset_features(dataset_id):
105-
cache_dir = config.get_cache_directory()
106-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
114+
did_cache_dir = _create_cache_directory_for_id(
115+
DATASETS_CACHE_DIR_NAME, dataset_id,
116+
)
107117
features_file = os.path.join(did_cache_dir, "features.xml")
108118
try:
109119
with io.open(features_file, encoding='utf8') as fh:
@@ -115,8 +125,9 @@ def _get_cached_dataset_features(dataset_id):
115125

116126

117127
def _get_cached_dataset_qualities(dataset_id):
118-
cache_dir = config.get_cache_directory()
119-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
128+
did_cache_dir = _create_cache_directory_for_id(
129+
DATASETS_CACHE_DIR_NAME, dataset_id,
130+
)
120131
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
121132
try:
122133
with io.open(qualities_file, encoding='utf8') as fh:
@@ -128,8 +139,9 @@ def _get_cached_dataset_qualities(dataset_id):
128139

129140

130141
def _get_cached_dataset_arff(dataset_id):
131-
cache_dir = config.get_cache_directory()
132-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
142+
did_cache_dir = _create_cache_directory_for_id(
143+
DATASETS_CACHE_DIR_NAME, dataset_id,
144+
)
133145
output_file = os.path.join(did_cache_dir, "dataset.arff")
134146

135147
try:
@@ -311,9 +323,11 @@ def get_dataset(dataset_id):
311323

312324
with lockutils.external_lock(
313325
name='datasets.functions.get_dataset:%d' % dataset_id,
314-
lock_path=os.path.join(config.get_cache_directory(), 'locks'),
326+
lock_path=_create_lockfiles_dir(),
315327
):
316-
did_cache_dir = _create_dataset_cache_directory(dataset_id)
328+
did_cache_dir = _create_cache_directory_for_id(
329+
DATASETS_CACHE_DIR_NAME, dataset_id,
330+
)
317331

318332
try:
319333
remove_dataset_cache = True
@@ -330,7 +344,7 @@ def get_dataset(dataset_id):
330344
raise e
331345
finally:
332346
if remove_dataset_cache:
333-
_remove_dataset_cache_dir(did_cache_dir)
347+
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)
334348

335349
dataset = _create_dataset_from_description(
336350
description, features, qualities, arff_file
@@ -412,7 +426,7 @@ def _get_dataset_arff(did_cache_dir, description):
412426
pass
413427

414428
url = description['oml:url']
415-
arff_string = _read_url(url)
429+
arff_string = openml._api_calls._read_url(url)
416430
md5 = hashlib.md5()
417431
md5.update(arff_string.encode('utf-8'))
418432
md5_checksum = md5.hexdigest()
@@ -505,55 +519,6 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
505519
return qualities
506520

507521

508-
def _create_dataset_cache_directory(dataset_id):
509-
"""Create a dataset cache directory
510-
511-
In order to have a clearer cache structure and because every dataset
512-
is cached in several files (description, arff, features, qualities), there
513-
is a directory for each dataset with the dataset ID being the directory
514-
name. This function creates this cache directory.
515-
516-
This function is NOT thread/multiprocessing safe.
517-
518-
Parameters
519-
----------
520-
did : int
521-
Dataset ID
522-
523-
Returns
524-
-------
525-
str
526-
Path of the created dataset cache directory.
527-
"""
528-
dataset_cache_dir = os.path.join(
529-
config.get_cache_directory(),
530-
"datasets",
531-
str(dataset_id),
532-
)
533-
if os.path.exists(dataset_cache_dir) and os.path.isdir(dataset_cache_dir):
534-
pass
535-
elif os.path.exists(dataset_cache_dir) and not os.path.isdir(dataset_cache_dir):
536-
raise ValueError('Dataset cache dir exists but is not a directory!')
537-
else:
538-
os.makedirs(dataset_cache_dir)
539-
return dataset_cache_dir
540-
541-
542-
def _remove_dataset_cache_dir(did_cache_dir):
543-
"""Remove the dataset cache directory
544-
545-
This function is NOT thread/multiprocessing safe.
546-
547-
Parameters
548-
----------
549-
"""
550-
try:
551-
shutil.rmtree(did_cache_dir)
552-
except (OSError, IOError):
553-
raise ValueError('Cannot remove faulty dataset cache directory %s.'
554-
'Please do this manually!' % did_cache_dir)
555-
556-
557522
def _create_dataset_from_description(description, features, qualities, arff_file):
558523
"""Create a dataset object from a description dict.
559524

openml/runs/functions.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import io
33
import json
44
import os
5+
import shutil
56
import sys
67
import time
78
import warnings
@@ -28,6 +29,8 @@
2829
# _get_version_info, _get_dict and _create_setup_string are in run.py to avoid
2930
# circular imports
3031

32+
RUNS_CACHE_DIR_NAME = 'runs'
33+
3134

3235
def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
3336
seed=None):
@@ -643,7 +646,7 @@ def get_run(run_id):
643646
run : OpenMLRun
644647
Run corresponding to ID, fetched from the server.
645648
"""
646-
run_dir = os.path.join(config.get_cache_directory(), "runs", str(run_id))
649+
run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)
647650
run_file = os.path.join(run_dir, "description.xml")
648651

649652
if not os.path.exists(run_dir):
@@ -878,8 +881,9 @@ def _create_trace_from_arff(arff_obj):
878881

879882
def _get_cached_run(run_id):
880883
"""Load a run from the cache."""
881-
cache_dir = config.get_cache_directory()
882-
run_cache_dir = os.path.join(cache_dir, "runs", str(run_id))
884+
run_cache_dir = openml.utils._create_cache_directory_for_id(
885+
RUNS_CACHE_DIR_NAME, run_id,
886+
)
883887
try:
884888
run_file = os.path.join(run_cache_dir, "description.xml")
885889
with io.open(run_file, encoding='utf8') as fh:

openml/runs/run.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import openml
1111
import openml._api_calls
1212
from ..tasks import get_task
13-
from .._api_calls import _file_id_to_url
1413
from ..exceptions import PyOpenMLError
1514

1615

@@ -142,7 +141,9 @@ def get_metric_fn(self, sklearn_fn, kwargs={}):
142141
if self.data_content is not None and self.task_id is not None:
143142
predictions_arff = self._generate_arff_dict()
144143
elif 'predictions' in self.output_files:
145-
predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
144+
predictions_file_url = openml._api_calls._file_id_to_url(
145+
self.output_files['predictions'], 'predictions.arff',
146+
)
146147
predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
147148
# TODO: make this a stream reader
148149
else:

0 commit comments

Comments
 (0)