diff --git a/astroquery/__init__.py b/astroquery/__init__.py index 5513075fc9..0c11541b2e 100644 --- a/astroquery/__init__.py +++ b/astroquery/__init__.py @@ -16,6 +16,7 @@ import logging from .logger import _init_log +from astropy import config as _config __all__ = ["__version__", "__githash__", "__citation__", "__bibtex__", "test", "log"] @@ -38,3 +39,23 @@ def _get_bibtex(): logging.addLevelName(5, "TRACE") log = logging.getLogger() log = _init_log() + + +# Set up cache configuration +class Cache_Conf(_config.ConfigNamespace): + + cache_timeout = _config.ConfigItem( + 604800, + ('Astroquery-wide cache timeout (seconds). Default is 1 week (604800). ' + 'Setting to None prevents the cache from expiring (not recommended).'), + cfgtype='integer' + ) + + cache_active = _config.ConfigItem( + True, + "Astroquery global cache usage, False turns off all caching.", + cfgtype='boolean' + ) + + +cache_conf = Cache_Conf() diff --git a/astroquery/query.py b/astroquery/query.py index ddc3dc4fa9..998d1a1cdd 100644 --- a/astroquery/query.py +++ b/astroquery/query.py @@ -11,26 +11,31 @@ import requests import textwrap +from datetime import datetime, timedelta +from pathlib import Path + from astropy.config import paths -from astroquery import log import astropy.units as u from astropy.utils.console import ProgressBarOrSpinner import astropy.utils.data +from astropy.utils import deprecated + +from astroquery import version, log, cache_conf +from astroquery.utils import system_tools -from . 
import version -from .utils import system_tools __all__ = ['BaseQuery', 'QueryWithLogin'] def to_cache(response, cache_file): log.debug("Caching data to {0}".format(cache_file)) + response = copy.deepcopy(response) if hasattr(response, 'request'): for key in tuple(response.request.hooks.keys()): del response.request.hooks[key] with open(cache_file, "wb") as f: - pickle.dump(response, f) + pickle.dump(response, f, protocol=4) def _replace_none_iterable(iterable): @@ -102,20 +107,30 @@ def hash(self): return self._hash def request_file(self, cache_location): - fn = os.path.join(cache_location, self.hash() + ".pickle") + fn = cache_location.joinpath(self.hash() + ".pickle") return fn - def from_cache(self, cache_location): + def from_cache(self, cache_location, cache_timeout): request_file = self.request_file(cache_location) try: - with open(request_file, "rb") as f: - response = pickle.load(f) - if not isinstance(response, requests.Response): + if cache_timeout is None: + expired = False + else: + current_time = datetime.utcnow() + cache_time = datetime.utcfromtimestamp(request_file.stat().st_mtime) + expired = current_time-cache_time > timedelta(seconds=cache_timeout) + if not expired: + with open(request_file, "rb") as f: + response = pickle.load(f) + if not isinstance(response, requests.Response): + response = None + else: + log.debug(f"Cache expired for {request_file}...") response = None except FileNotFoundError: response = None if response: - log.debug("Retrieving data from {0}".format(request_file)) + log.debug("Retrieved data from {0}".format(request_file)) return response def remove_cache_file(self, cache_location): @@ -125,8 +140,8 @@ def remove_cache_file(self, cache_location): """ request_file = self.request_file(cache_location) - if os.path.exists(request_file): - os.remove(request_file) + if request_file.exists(): + request_file.unlink() else: raise FileNotFoundError(f"Tried to remove cache file {request_file} but " "it does not exist") @@ -173,11 +188,8 
@@ def __init__(self): .format(vers=version.version, olduseragent=S.headers['User-Agent'])) - self.cache_location = os.path.join( - paths.get_cache_dir(), 'astroquery', - self.__class__.__name__.split("Class")[0]) - os.makedirs(self.cache_location, exist_ok=True) - self._cache_active = True + self.name = self.__class__.__name__.split("Class")[0] + self._cache_location = None def __call__(self, *args, **kwargs): """ init a fresh copy of self """ @@ -217,9 +229,28 @@ def _response_hook(self, response, *args, **kwargs): f"-----------------------------------------", '\t') log.log(5, f"HTTP response\n{response_log}") + @property + def cache_location(self): + cl = self._cache_location or Path(paths.get_cache_dir(), 'astroquery', self.name) + cl.mkdir(parents=True, exist_ok=True) + return cl + + @cache_location.setter + def cache_location(self, loc): + self._cache_location = Path(loc) + + def reset_cache_location(self): + """Resets the cache location to the default astropy cache""" + self._cache_location = None + + def clear_cache(self): + """Removes all cache files.""" + for fle in self.cache_location.glob("*.pickle"): + fle.unlink() + def _request(self, method, url, params=None, data=None, headers=None, - files=None, save=False, savedir='', timeout=None, cache=True, + files=None, save=False, savedir='', timeout=None, cache=None, stream=False, auth=None, continuation=True, verify=True, allow_redirects=True, json=None, return_response_on_save=False): @@ -253,6 +284,7 @@ def _request(self, method, url, somewhere other than `BaseQuery.cache_location` timeout : int cache : bool + Optional, if specified, overrides global cache settings. verify : bool Verify the server's TLS certificate? (see http://docs.python-requests.org/en/master/_modules/requests/sessions/?highlight=verify) @@ -278,12 +310,16 @@ def _request(self, method, url, is True. 
""" + if cache is None: # Global caching not overridden + cache = cache_conf.cache_active + if save: local_filename = url.split('/')[-1] if os.name == 'nt': # Windows doesn't allow special characters in filenames like # ":" so replace them with an underscore local_filename = local_filename.replace(':', '_') + local_filepath = os.path.join(savedir or self.cache_location or '.', local_filename) response = self._download_file(url, local_filepath, cache=cache, timeout=timeout, @@ -298,14 +334,14 @@ def _request(self, method, url, else: query = AstroQuery(method, url, params=params, data=data, headers=headers, files=files, timeout=timeout, json=json) - if ((self.cache_location is None) or (not self._cache_active) or (not cache)): - with suspend_cache(self): + if not cache: + with cache_conf.set_temp("cache_active", False): response = query.request(self._session, stream=stream, auth=auth, verify=verify, allow_redirects=allow_redirects, json=json) else: - response = query.from_cache(self.cache_location) + response = query.from_cache(self.cache_location, cache_conf.cache_timeout) if not response: response = query.request(self._session, self.cache_location, @@ -315,6 +351,7 @@ def _request(self, method, url, verify=verify, json=json) to_cache(response, query.request_file(self.cache_location)) + self._last_query = query return response @@ -336,6 +373,7 @@ def _download_file(self, url, local_filepath, timeout=None, auth=None, supports HTTP "range" requests, the download will be continued where it left off. cache : bool + Cache downloaded file. Defaults to False. method : "GET" or "POST" head_safe : bool """ @@ -439,19 +477,21 @@ def _download_file(self, url, local_filepath, timeout=None, auth=None, return response +@deprecated(since="v0.4.7", message=("The suspend_cache function is deprecated, " + "Use the conf set_temp function instead.")) class suspend_cache: """ A context manager that suspends caching. 
""" - def __init__(self, obj): - self.obj = obj + def __init__(self, obj=None): + self.original_cache_setting = cache_conf.cache_active def __enter__(self): - self.obj._cache_active = False + cache_conf.cache_active = False def __exit__(self, exc_type, exc_value, traceback): - self.obj._cache_active = True + cache_conf.cache_active = self.original_cache_setting return False @@ -507,7 +547,7 @@ def _login(self, *args, **kwargs): pass def login(self, *args, **kwargs): - with suspend_cache(self): + with cache_conf.set_temp("cache_active", False): self._authenticated = self._login(*args, **kwargs) return self._authenticated diff --git a/astroquery/tests/test_cache.py b/astroquery/tests/test_cache.py new file mode 100644 index 0000000000..d04d7782b9 --- /dev/null +++ b/astroquery/tests/test_cache.py @@ -0,0 +1,225 @@ +import requests +import os + +from time import mktime +from datetime import datetime + +from astropy.config import paths + +from astroquery.query import QueryWithLogin +from astroquery import cache_conf + +URL1 = "http://fakeurl.edu" +URL2 = "http://fakeurl.ac.uk" + +TEXT1 = "Penguin" +TEXT2 = "Walrus" + + +def set_response(resp_text, resp_status=200): + """Function that allows us to set a specific mock response for cache testing""" + + def get_mockreturn(url, *args, **kwargs): + """Generate a mock return to a requests call""" + + myresp = requests.Response() + myresp._content = resp_text + myresp.request = requests.PreparedRequest() + myresp.status_code = resp_status + + return myresp + + requests.Session.request = get_mockreturn + + +class CacheTestClass(QueryWithLogin): + """Bare bones class for testing caching""" + + def test_func(self, requrl): + + return self._request(method="GET", url=requrl) + + def _login(self, username): + + return self._request(method="GET", url=username).content == "Penguin" + + +def test_conf(): + cache_conf.reset() + + default_timeout = cache_conf.cache_timeout + default_active = cache_conf.cache_active + + assert 
default_timeout == 604800 + assert default_active is True + + with cache_conf.set_temp("cache_timeout", 5): + assert cache_conf.cache_timeout == 5 + + with cache_conf.set_temp("cache_active", False): + assert cache_conf.cache_active is False + + assert cache_conf.cache_timeout == default_timeout + assert cache_conf.cache_active == default_active + + cache_conf.cache_timeout = 5 + cache_conf.cache_active = False + cache_conf.reset() + + assert cache_conf.cache_timeout == default_timeout + assert cache_conf.cache_active == default_active + + +def test_basic_caching(): + cache_conf.reset() + + mytest = CacheTestClass() + assert cache_conf.cache_active + + mytest.clear_cache() + assert len(os.listdir(mytest.cache_location)) == 0 + + set_response(TEXT1) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT1 + assert len(os.listdir(mytest.cache_location)) == 1 + + set_response(TEXT2) + + resp = mytest.test_func(URL2) # query that has not been cached + assert resp.content == TEXT2 + assert len(os.listdir(mytest.cache_location)) == 2 + + resp = mytest.test_func(URL1) + assert resp.content == TEXT1 # query that was cached + assert len(os.listdir(mytest.cache_location)) == 2 # no new cache file + + mytest.clear_cache() + assert len(os.listdir(mytest.cache_location)) == 0 + + resp = mytest.test_func(URL1) + assert resp.content == TEXT2 # Now get new response + + +def test_change_location(tmp_path): + cache_conf.reset() + + mytest = CacheTestClass() + default_cache_location = mytest.cache_location + + assert paths.get_cache_dir() in str(default_cache_location) + assert "astroquery" in mytest.cache_location.parts + assert mytest.name in mytest.cache_location.parts + + new_loc = tmp_path.joinpath("new_dir") + mytest.cache_location = new_loc + assert mytest.cache_location == new_loc + + mytest.reset_cache_location() + assert mytest.cache_location == default_cache_location + + new_loc.mkdir(parents=True, exist_ok=True) + with paths.set_temp_cache(new_loc): + assert 
str(new_loc) in str(mytest.cache_location) + assert "astroquery" in mytest.cache_location.parts + assert mytest.name in mytest.cache_location.parts + + +def test_login(): + cache_conf.reset() + + mytest = CacheTestClass() + assert cache_conf.cache_active + + mytest.clear_cache() + assert len(os.listdir(mytest.cache_location)) == 0 + + set_response(TEXT1) # Text 1 is set as the approved password + + mytest.login("ceb") + assert mytest.authenticated() + assert len(os.listdir(mytest.cache_location)) == 0 # request should not be cached + + set_response(TEXT2) # Text 2 is not the approved password + + mytest.login("ceb") + assert not mytest.authenticated() # Should not be accessing cache + + +def test_timeout(monkeypatch): + cache_conf.reset() + + mytest = CacheTestClass() + assert cache_conf.cache_active + + mytest.clear_cache() + assert len(os.listdir(mytest.cache_location)) == 0 + + set_response(TEXT1) # setting the response + + resp = mytest.test_func(URL1) # should be cached + assert resp.content == TEXT1 + + set_response(TEXT2) # changing the response + + resp = mytest.test_func(URL1) # should access cached value + assert resp.content == TEXT1 + + # Changing the file date so the cache will consider it expired + cache_file = next(mytest.cache_location.iterdir()) + modTime = mktime(datetime(1970, 1, 1).timetuple()) + os.utime(cache_file, (modTime, modTime)) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT2 # now see the new response + + # Testing a cache timeout of "none" + cache_conf.cache_timeout = None + set_response(TEXT1) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT2 # cache is accessed + + +def test_deactivate(): + cache_conf.reset() + + mytest = CacheTestClass() + cache_conf.cache_active = False + + mytest.clear_cache() + assert len(os.listdir(mytest.cache_location)) == 0 + + set_response(TEXT1) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT1 + assert len(os.listdir(mytest.cache_location)) == 0 + + 
set_response(TEXT2) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT2 + assert len(os.listdir(mytest.cache_location)) == 0 + + cache_conf.reset() + assert cache_conf.cache_active is True + + with cache_conf.set_temp('cache_active', False): + mytest.clear_cache() + assert len(os.listdir(mytest.cache_location)) == 0 + + set_response(TEXT1) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT1 + assert len(os.listdir(mytest.cache_location)) == 0 + + set_response(TEXT2) + + resp = mytest.test_func(URL1) + assert resp.content == TEXT2 + assert len(os.listdir(mytest.cache_location)) == 0 + + assert cache_conf.cache_active is True diff --git a/docs/index.rst b/docs/index.rst index fefa351d1d..9050e00bec 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -172,6 +172,50 @@ uncomment the relevant configuration item(s), and insert your desired value(s). configuration + +Caching +------- + +By default Astroquery employs query caching with a timeout of 1 week. +The user can clear their cache at any time, as well as suspend cache usage, +and change the cache location. Caching persists between Astroquery sessions. +If you know the service you are using has released new data recently, or if you believe you are +not receiving the newest data, try clearing the cache. + + +The Astroquery cache location is divided by service, so each service's cache should be managed individually, +however whether the cache is active and the expiration time are controlled centrally through the +astroquery ``cache_conf`` module. Astroquery uses the Astropy configuration infrastructure, information about +temporarily or permanently changing configuration values can be found +`here <https://docs.astropy.org/en/stable/config/index.html>`_. + +Shown here are the cache properties, using Simbad as an example: + +.. code-block:: python + + >>> from astroquery import cache_conf + >>> from astroquery.simbad import Simbad + ... + >>> # Is the cache active? 
+ >>> print(cache_conf.cache_active) + True + >>> # Cache timeout in seconds + >>> print(cache_conf.cache_timeout) + 604800 + >>> # Cache location + >>> print(Simbad.cache_location) # doctest: +IGNORE_OUTPUT + /Users/username/.astropy/cache/astroquery/Simbad + + +To clear the cache: + +.. code-block:: python + + >>> Simbad.clear_cache() + + + + Available Services ==================