
Commit c15fc53

cleaning up data api file cache (#737)
* api_client.Client: introduce purge_cache method that deletes obsolete files from ~/climada/data
* api_client.purge_cache: simplification
* util.api_client: improved readability
* doc: Client.purge_cache
* doc.api_client: cosmetics
1 parent 654f9cb commit c15fc53

File tree

4 files changed: +121 -7 lines changed

CHANGELOG.md
climada/test/test_api_client.py
climada/util/api_client.py
doc/tutorial/climada_util_api_client.ipynb

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -23,7 +23,8 @@ Removed:
 - `climada.util.coordinates.match_centroids` method for matching (hazard) centroids to GeoDataFrames [#602](https://github.com/CLIMADA-project/climada_python/pull/602)
 - 'Extra' requirements `doc`, `test`, and `dev` for Python package [#712](https://github.com/CLIMADA-project/climada_python/pull/712)
 - Added method `Exposures.centroids_total_value` to replace the functionality of `Exposures.affected_total_value`. This method is temporary and deprecated. [#702](https://github.com/CLIMADA-project/climada_python/pull/702)
-
+- New method `climada.util.api_client.Client.purge_cache`: utility function to remove outdated files from the local file system to free disk space.
+  ([#737](https://github.com/CLIMADA-project/climada_python/pull/737))
 
 ### Changed

climada/test/test_api_client.py

Lines changed: 41 additions & 1 deletion
@@ -19,8 +19,8 @@
 Test save module.
 """
 from pathlib import Path
+import tempfile
 import unittest
-from shutil import rmtree
 
 import numpy as np
 
@@ -233,6 +233,46 @@ def test_multiplicity_split(self):
         self.assertEqual(straight, {'b': '1'})
         self.assertEqual(multi, {'country_name': ['x', 'y', 'z']})
 
+    def test_purge_cache(self):
+        client = Client()
+
+        active_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v2")
+        outdated_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v1")
+        test_ds = client.get_dataset_info(data_type="storm_europe", name="test_storm_europe_icon_2021012800", version="v1", status="test_dataset")
+        expired_ds = client.get_dataset_info(data_type="tropical_cyclone", name="rename_files2", version="v1", status="expired")
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            for ds in [active_ds, outdated_ds, test_ds, expired_ds]:
+                client.download_dataset(dataset=ds, target_dir=Path(temp_dir))
+            self.assertEqual(  # outdated dataset present
+                1,
+                len(list(Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').iterdir()))
+            )
+            self.assertEqual(  # expired data set present
+                1,
+                len(list(Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').iterdir()))
+            )
+
+            client.purge_cache(target_dir=temp_dir)
+            self.assertFalse(  # outdated data set removed
+                Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').is_dir()
+            )
+            self.assertFalse(  # expired data set removed
+                Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').is_dir()
+            )
+            self.assertEqual(  # test files are still there
+                3,
+                len(list(Path(temp_dir).joinpath('hazard/storm_europe/test_storm_europe_icon_2021012800/v1').iterdir()))
+            )
+
+            client.purge_cache(target_dir=temp_dir, keep_testfiles=False)
+            self.assertTrue(  # up-to-date active dataset file still there
+                Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v2/LitPop_150arcsec_ABW.hdf5').exists()
+            )
+            self.assertFalse(  # test data removed, empty directories removed
+                Path(temp_dir).joinpath('hazard/').exists()
+            )
+
 
 def rm_empty_dir(folder):
     for subfolder in folder.iterdir():

climada/util/api_client.py

Lines changed: 66 additions & 5 deletions
@@ -23,6 +23,7 @@
 import hashlib
 import json
 import logging
+from os.path import commonprefix
 from pathlib import Path
 from urllib.parse import quote, unquote, urlsplit, urlunsplit
 import time

@@ -594,14 +595,20 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3):
             local_path /= fileinfo.file_name
             downloaded = self._tracked_download(remote_url=fileinfo.url, local_path=local_path)
             if not downloaded.enddownload:
-                raise Download.Failed("Download seems to be in progress, please try again later"
-                                      " or remove cache entry by calling"
-                                      f" `Client.purge_cache(Path('{local_path}'))`!")
+                raise Download.Failed(f"A download of {fileinfo.url} via the API Client has been"
+                                      " requested before. Either it is still in progress or the"
+                                      " process got interrupted. In the former case just wait"
+                                      " until the download has finished and try again, in the"
+                                      f" latter run `Client.purge_cache_db(Path('{local_path}'))`"
+                                      " from Python. If unsure, check your internet connection,"
+                                      " wait for as long as it takes to download a file of size"
+                                      f" {fileinfo.file_size} and try again. If the problem"
+                                      " persists, purge the cache db with said call.")
             try:
                 check(local_path, fileinfo)
             except Download.Failed as dlf:
                 local_path.unlink(missing_ok=True)
-                self.purge_cache(local_path)
+                self.purge_cache_db(local_path)
                 raise dlf
             return local_path
         except Download.Failed as dle:

@@ -663,7 +670,7 @@ def _organize_path(dataset, target_dir):
         return target_dir
 
     @staticmethod
-    def purge_cache(local_path):
+    def purge_cache_db(local_path):
         """Removes entry from the sqlite database that keeps track of files downloaded by
         `cached_download`. This may be necessary in case a previous attempt has failed
         in an uncontrolled way (power outage or the like).

@@ -1009,3 +1016,57 @@ def into_files_df(dataset_infos):
         """
         return Client.into_datasets_df(dataset_infos) \
             .merge(pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files]))
+
+    def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True):
+        """Removes downloaded dataset files from the given directory if they have been downloaded
+        with the API client, if they are beneath the given directory and if one of the following
+        is the case:
+        - their status is neither 'active' nor 'test_dataset'
+        - their status is 'test_dataset' and keep_testfiles is set to False
+        - their status is 'active' and they are outdated, i.e., there is a dataset with the same
+          data_type and name but a newer version.
+
+        Parameters
+        ----------
+        target_dir : Path or str, optional
+            files downloaded beneath this directory and empty subdirectories will be removed.
+            default: SYSTEM_DIR
+        keep_testfiles : bool, optional
+            if set to True, files from datasets with status 'test_dataset' will not be removed.
+            default: True
+        """
+
+        # collect urls from datasets that should not be removed
+        test_datasets = self.list_dataset_infos(status='test_dataset') if keep_testfiles else []
+        test_urls = set(
+            file_info.url for ds_info in test_datasets for file_info in ds_info.files)
+
+        active_datasets = self.list_dataset_infos(status='active', version='newest')
+        active_urls = set(
+            file_info.url for ds_info in active_datasets for file_info in ds_info.files)
+
+        not_to_be_removed = test_urls.union(active_urls)
+
+        # make a list of downloaded files that could be removed
+        to_be_removed = [d for d in Download.select() if d.url not in not_to_be_removed]
+
+        # resolve target_dir for filtering downloads by path
+        target_dir = Path(target_dir).absolute()
+
+        # remove files and sqlite db entries
+        for obsolete in to_be_removed:
+            opath = Path(obsolete.path)
+            if opath.exists() and Path(commonprefix([target_dir, opath])) == target_dir:
+                opath.unlink()
+                obsolete.delete_instance()
+
+        # clean up: remove all empty directories beneath target_dir
+        def rm_empty_dirs(directory: Path):
+            for subdir in directory.iterdir():
+                if subdir.is_dir():
+                    rm_empty_dirs(subdir)
+            try:
+                directory.rmdir()
+            except OSError:  # raised when the directory is not empty
+                pass
+        rm_empty_dirs(target_dir)
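
For orientation, a minimal usage sketch of the purge_cache method introduced above (not part of the diff itself; the explicit target path below is illustrative only, and the default target_dir is SYSTEM_DIR, i.e. ~/climada/data):

from climada.util.api_client import Client

client = Client()

# Remove outdated or expired dataset files from the default download directory,
# keeping files that belong to datasets with status 'test_dataset'.
client.purge_cache()

# Restrict the cleanup to a specific directory and also drop test dataset files
# (the path here is a placeholder, not a path mandated by the API client).
client.purge_cache(target_dir="/path/to/local/climada/data", keep_testfiles=False)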

doc/tutorial/climada_util_api_client.ipynb

Lines changed: 12 additions & 0 deletions
@@ -1204,6 +1204,18 @@
     "ds_files[0], ds_files[0].is_file()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Local File Cache\n",
+    "\n",
+    "By default, the API Client downloads files into the `~/climada/data` directory.\n",
+    "\n",
+    "Over time, obsolete files may accumulate in this directory, either because a newer version of these files is available from the [CLIMADA data API](https://climada.ethz.ch) or because the corresponding dataset has expired altogether.\\\n",
+    "To prevent file rot and free disk space, all outdated files can be removed at once by simply calling `Client().purge_cache()`. This removes all files that were ever downloaded with the `api_client.Client` and for which a newer version exists, even if the newer version has not been downloaded yet."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
