|
23 | 23 | import hashlib |
24 | 24 | import json |
25 | 25 | import logging |
| 26 | +from os.path import commonprefix |
26 | 27 | from pathlib import Path |
27 | 28 | from urllib.parse import quote, unquote, urlsplit, urlunsplit |
28 | 29 | import time |
@@ -594,14 +595,20 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3): |
594 | 595 | local_path /= fileinfo.file_name |
595 | 596 | downloaded = self._tracked_download(remote_url=fileinfo.url, local_path=local_path) |
596 | 597 | if not downloaded.enddownload: |
597 | | - raise Download.Failed("Download seems to be in progress, please try again later" |
598 | | - " or remove cache entry by calling" |
599 | | - f" `Client.purge_cache(Path('{local_path}'))`!") |
| 598 | + raise Download.Failed(f"A download of {fileinfo.url} via the API Client has been"
| 599 | + " requested before. Either it is still in progress or the"
| 600 | + " process got interrupted. In the former case, just wait"
| 601 | + " until the download has finished and try again; in the"
| 602 | + f" latter, run `Client.purge_cache_db(Path('{local_path}'))`"
| 603 | + " from Python. If unsure, check your internet connection,"
| 604 | + " wait for as long as it takes to download a file of size"
| 605 | + f" {fileinfo.file_size}, and try again. If the problem"
| 606 | + " persists, purge the cache db with said call.")
600 | 607 | try: |
601 | 608 | check(local_path, fileinfo) |
602 | 609 | except Download.Failed as dlf: |
603 | 610 | local_path.unlink(missing_ok=True) |
604 | | - self.purge_cache(local_path) |
| 611 | + self.purge_cache_db(local_path) |
605 | 612 | raise dlf |
606 | 613 | return local_path |
607 | 614 | except Download.Failed as dle: |
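
The new message leaves the caller with a choice between waiting and purging the stale cache entry. A minimal sketch of that wait-then-retry behaviour, assuming `download` stands in for a zero-argument callable wrapping whatever public Client call ends up in `_download_file` (the public entry point is not part of this diff) and that `Download` and `Client` are importable from the same module:

    import time
    from pathlib import Path

    def retry_or_purge(download, local_path, wait_seconds=300):
        # first attempt may fail while another process is still downloading the file
        try:
            return download()
        except Download.Failed:
            # give a concurrent download time to finish, then try once more
            time.sleep(wait_seconds)
            try:
                return download()
            except Download.Failed:
                # still failing: assume the earlier attempt was interrupted,
                # drop the stale bookkeeping entry and download from scratch
                Client.purge_cache_db(Path(local_path))
                return download()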
@@ -663,7 +670,7 @@ def _organize_path(dataset, target_dir): |
663 | 670 | return target_dir |
664 | 671 |
|
665 | 672 | @staticmethod |
666 | | - def purge_cache(local_path): |
| 673 | + def purge_cache_db(local_path): |
667 | 674 | """Removes entry from the sqlite database that keeps track of files downloaded by |
668 | 675 | `cached_download`. This may be necessary in case a previous attempt has failed |
669 | 676 | in an uncontrolled way (power outage or the like).
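
As the docstring says, `purge_cache_db` is the manual escape hatch after an interrupted download. A short usage sketch, assuming the client module lives at `climada.util.api_client` (the import path is not shown in this diff) and a placeholder file path:

    from pathlib import Path
    from climada.util.api_client import Client  # import path assumed

    # drop the stale sqlite entry for a file whose download got interrupted,
    # using the path quoted in the Download.Failed message
    Client.purge_cache_db(Path("/path/quoted/in/the/error/message.hdf5"))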
@@ -1009,3 +1016,57 @@ def into_files_df(dataset_infos): |
1009 | 1016 | """ |
1010 | 1017 | return Client.into_datasets_df(dataset_infos) \ |
1011 | 1018 | .merge(pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files])) |
| 1019 | + |
| 1020 | + def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True): |
| 1021 | + """Removes dataset files beneath the given directory, provided they have been
| 1022 | + downloaded with the API client and one of the following is the case:
| 1023 | +
| 1024 | + - their status is neither 'active' nor 'test_dataset'
| 1025 | + - their status is 'test_dataset' and keep_testfiles is set to False
| 1026 | + - their status is 'active' and they are outdated, i.e., there is a dataset with the same
| 1027 | + data_type and name but a newer version.
| 1028 | +
|
| 1029 | + Parameters |
| 1030 | + ---------- |
| 1031 | + target_dir : Path or str, optional |
| 1032 | + files downloaded beneath this directory will be removed, together with any empty subdirectories.
| 1033 | + default: SYSTEM_DIR
| 1034 | + keep_testfiles : bool, optional |
| 1035 | + if set to True, files from datasets with status 'test_dataset' will not be removed. |
| 1036 | + default: True |
| 1037 | + """ |
| 1038 | + |
| 1039 | + # collect urls from datasets that should not be removed |
| 1040 | + test_datasets = self.list_dataset_infos(status='test_dataset') if keep_testfiles else [] |
| 1041 | + test_urls = set( |
| 1042 | + file_info.url for ds_info in test_datasets for file_info in ds_info.files) |
| 1043 | + |
| 1044 | + active_datasets = self.list_dataset_infos(status='active', version='newest') |
| 1045 | + active_urls = set( |
| 1046 | + file_info.url for ds_info in active_datasets for file_info in ds_info.files) |
| 1047 | + |
| 1048 | + not_to_be_removed = test_urls.union(active_urls) |
| 1049 | + |
| 1050 | + # make a list of downloaded files that could be removed |
| 1051 | + to_be_removed = [d for d in Download.select() if d.url not in not_to_be_removed] |
| 1052 | + |
| 1053 | + # normalize target_dir to an absolute path for the prefix check below
| 1054 | + target_dir = Path(target_dir).absolute() |
| 1055 | + |
| 1056 | + # remove files and sqlite db entries |
| 1057 | + for obsolete in to_be_removed: |
| 1058 | + opath = Path(obsolete.path) |
| 1059 | + if opath.exists() and Path(commonprefix([target_dir, opath])) == target_dir: |
| 1060 | + opath.unlink() |
| 1061 | + obsolete.delete_instance() |
| 1062 | + |
| 1063 | + # clean up: remove all empty directories beneath target_dir |
| 1064 | + def rm_empty_dirs(directory: Path): |
| 1065 | + for subdir in directory.iterdir(): |
| 1066 | + if subdir.is_dir(): |
| 1067 | + rm_empty_dirs(subdir) |
| 1068 | + try: |
| 1069 | + directory.rmdir() |
| 1070 | + except OSError: # raised when the directory is not empty |
| 1071 | + pass |
| 1072 | + rm_empty_dirs(target_dir) |
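
A short usage sketch for the new `purge_cache` method, again assuming the `climada.util.api_client` import path; it only touches files that the client itself downloaded and that sit beneath `target_dir`:

    from climada.util.api_client import Client  # import path assumed

    client = Client()
    # default: keep 'test_dataset' files, remove outdated or inactive ones under SYSTEM_DIR
    client.purge_cache()
    # stricter: also drop test files, restricted to a custom directory
    client.purge_cache(target_dir="/tmp/climada_data", keep_testfiles=False)

Both calls also remove any directories beneath target_dir that end up empty, as per the rm_empty_dirs cleanup above.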