
Commit c15fc53

cleaning up data api file cache (#737)
* api_client.Client: introduce purge_cache method that deletes obsolete files from ~/climada/data
* api_client.purge_cache: simplification
* util.api_client: improved readability
* doc: Client.purge_cache
* doc.api_client: cosmetics
1 parent 654f9cb commit c15fc53

File tree

4 files changed: +121 -7 lines changed

CHANGELOG.md
climada/test/test_api_client.py
climada/util/api_client.py
doc/tutorial/climada_util_api_client.ipynb

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -23,7 +23,8 @@ Removed:
 - `climada.util.coordinates.match_centroids` method for matching (hazard) centroids to GeoDataFrames [#602](https://github.com/CLIMADA-project/climada_python/pull/602)
 - 'Extra' requirements `doc`, `test`, and `dev` for Python package [#712](https://github.com/CLIMADA-project/climada_python/pull/712)
 - Added method `Exposures.centroids_total_value` to replace the functionality of `Exposures.affected_total_value`. This method is temporary and deprecated. [#702](https://github.com/CLIMADA-project/climada_python/pull/702)
-
+- New method `climada.util.api_client.Client.purge_cache`: utility function to remove outdated files from the local file system to free disk space.
+  ([#737](https://github.com/CLIMADA-project/climada_python/pull/737))
 
 ### Changed

climada/test/test_api_client.py

Lines changed: 41 additions & 1 deletion
@@ -19,8 +19,8 @@
 Test save module.
 """
 from pathlib import Path
+import tempfile
 import unittest
-from shutil import rmtree
 
 import numpy as np
 
@@ -233,6 +233,46 @@ def test_multiplicity_split(self):
         self.assertEqual(straight, {'b': '1'})
         self.assertEqual(multi, {'country_name': ['x', 'y', 'z']})
 
+    def test_purge_cache(self):
+        client = Client()
+
+        active_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v2")
+        outdated_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v1")
+        test_ds = client.get_dataset_info(data_type="storm_europe", name="test_storm_europe_icon_2021012800", version="v1", status="test_dataset")
+        expired_ds = client.get_dataset_info(data_type="tropical_cyclone", name="rename_files2", version="v1", status="expired")
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            for ds in [active_ds, outdated_ds, test_ds, expired_ds]:
+                client.download_dataset(dataset=ds, target_dir=Path(temp_dir))
+            self.assertEqual(  # outdated dataset present
+                1,
+                len(list(Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').iterdir()))
+            )
+            self.assertEqual(  # expired data set present
+                1,
+                len(list(Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').iterdir()))
+            )
+
+            client.purge_cache(target_dir=temp_dir)
+            self.assertFalse(  # outdated data set removed
+                Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').is_dir()
+            )
+            self.assertFalse(  # expired data set removed
+                Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').is_dir()
+            )
+            self.assertEqual(  # test files are still there
+                3,
+                len(list(Path(temp_dir).joinpath('hazard/storm_europe/test_storm_europe_icon_2021012800/v1').iterdir()))
+            )
+
+            client.purge_cache(target_dir=temp_dir, keep_testfiles=False)
+            self.assertTrue(  # up-to-date active dataset file still there
+                Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v2/LitPop_150arcsec_ABW.hdf5').exists()
+            )
+            self.assertFalse(  # test data removed, empty directories removed
+                Path(temp_dir).joinpath('hazard/').exists()
+            )
+
 
 def rm_empty_dir(folder):
     for subfolder in folder.iterdir():

climada/util/api_client.py

Lines changed: 66 additions & 5 deletions
@@ -23,6 +23,7 @@
 import hashlib
 import json
 import logging
+from os.path import commonprefix
 from pathlib import Path
 from urllib.parse import quote, unquote, urlsplit, urlunsplit
 import time

@@ -594,14 +595,20 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3):
             local_path /= fileinfo.file_name
             downloaded = self._tracked_download(remote_url=fileinfo.url, local_path=local_path)
             if not downloaded.enddownload:
-                raise Download.Failed("Download seems to be in progress, please try again later"
-                                      " or remove cache entry by calling"
-                                      f" `Client.purge_cache(Path('{local_path}'))`!")
+                raise Download.Failed(f"A download of {fileinfo.url} via the API Client has been"
+                                      " requested before. Either it is still in progress or the"
+                                      " process got interrupted. In the former case just wait"
+                                      " until the download has finished and try again, in the"
+                                      f" latter run `Client.purge_cache_db(Path('{local_path}'))`"
+                                      " from Python. If unsure, check your internet connection,"
+                                      " wait for as long as it takes to download a file of size"
+                                      f" {fileinfo.file_size} and try again. If the problem"
+                                      " persists, purge the cache db with said call.")
             try:
                 check(local_path, fileinfo)
             except Download.Failed as dlf:
                 local_path.unlink(missing_ok=True)
-                self.purge_cache(local_path)
+                self.purge_cache_db(local_path)
                 raise dlf
             return local_path
         except Download.Failed as dle:

@@ -663,7 +670,7 @@ def _organize_path(dataset, target_dir):
         return target_dir
 
     @staticmethod
-    def purge_cache(local_path):
+    def purge_cache_db(local_path):
         """Removes entry from the sqlite database that keeps track of files downloaded by
         `cached_download`. This may be necessary in case a previous attempt has failed
         in an uncontrolled way (power outage or the like).

@@ -1009,3 +1016,57 @@ def into_files_df(dataset_infos):
         """
         return Client.into_datasets_df(dataset_infos) \
             .merge(pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files]))
+
+    def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True):
+        """Removes downloaded dataset files from the given directory if they have been downloaded
+        with the API client, if they are beneath the given directory and if one of the following
+        is the case:
+        - their status is neither 'active' nor 'test_dataset'
+        - their status is 'test_dataset' and keep_testfiles is set to False
+        - their status is 'active' and they are outdated, i.e., there is a dataset with the same
+          data_type and name but a newer version.
+
+        Parameters
+        ----------
+        target_dir : Path or str, optional
+            files downloaded beneath this directory and empty subdirectories will be removed.
+            default: SYSTEM_DIR
+        keep_testfiles : bool, optional
+            if set to True, files from datasets with status 'test_dataset' will not be removed.
+            default: True
+        """
+
+        # collect urls from datasets that should not be removed
+        test_datasets = self.list_dataset_infos(status='test_dataset') if keep_testfiles else []
+        test_urls = set(
+            file_info.url for ds_info in test_datasets for file_info in ds_info.files)
+
+        active_datasets = self.list_dataset_infos(status='active', version='newest')
+        active_urls = set(
+            file_info.url for ds_info in active_datasets for file_info in ds_info.files)
+
+        not_to_be_removed = test_urls.union(active_urls)
+
+        # make a list of downloaded files that could be removed
+        to_be_removed = [d for d in Download.select() if d.url not in not_to_be_removed]
+
+        # resolve target_dir for filtering downloads by path
+        target_dir = Path(target_dir).absolute()
+
+        # remove files and sqlite db entries
+        for obsolete in to_be_removed:
+            opath = Path(obsolete.path)
+            if opath.exists() and Path(commonprefix([target_dir, opath])) == target_dir:
+                opath.unlink()
+                obsolete.delete_instance()
+
+        # clean up: remove all empty directories beneath target_dir
+        def rm_empty_dirs(directory: Path):
+            for subdir in directory.iterdir():
+                if subdir.is_dir():
+                    rm_empty_dirs(subdir)
+            try:
+                directory.rmdir()
+            except OSError:  # raised when the directory is not empty
+                pass
+        rm_empty_dirs(target_dir)
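
For orientation, a minimal usage sketch of the purge_cache method introduced above (not part of the diff itself; the explicit target path below is illustrative only, and the default target_dir is SYSTEM_DIR, i.e. ~/climada/data):

from climada.util.api_client import Client

client = Client()

# Remove outdated or expired dataset files from the default download directory,
# keeping files that belong to datasets with status 'test_dataset'.
client.purge_cache()

# Restrict the cleanup to a specific directory and also drop test dataset files
# (the path here is a placeholder, not a path mandated by the API client).
client.purge_cache(target_dir="/path/to/local/climada/data", keep_testfiles=False)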

doc/tutorial/climada_util_api_client.ipynb

Lines changed: 12 additions & 0 deletions
@@ -1204,6 +1204,18 @@
     "ds_files[0], ds_files[0].is_file()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Local File Cache\n",
+    "\n",
+    "By default, the API Client downloads files into the `~/climada/data` directory.\n",
+    "\n",
+    "Over time, obsolete files may accumulate in this directory, either because a newer version of these files is available from the [CLIMADA data API](https://climada.ethz.ch) or because the corresponding dataset has expired altogether.\\\n",
+    "To prevent file rot and free disk space, all outdated files can be removed at once by simply calling `Client().purge_cache()`. This removes all files that were ever downloaded with the `api_client.Client` and for which a newer version exists, even if the newer version has not been downloaded yet."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
