Skip to content

Commit f965ba7

Browse files
committed
add zenodo download helpers to io.py
1 parent dd53baa commit f965ba7

File tree

2 files changed

+84
-84
lines changed

2 files changed

+84
-84
lines changed

notebooks/data-acquisition.ipynb

Lines changed: 9 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -39,18 +39,17 @@
3939
},
4040
"outputs": [],
4141
"source": [
42-
"import tempfile\n",
43-
"from io import BytesIO\n",
44-
"from os import environ\n",
4542
"from pathlib import Path\n",
46-
"from zipfile import ZipFile\n",
4743
"\n",
48-
"import numpy as np\n",
4944
"import pandas as pd\n",
5045
"import requests\n",
5146
"from cartopy.io import shapereader\n",
52-
"from fsspec import FSTimeoutError\n",
53-
"from fsspec.implementations.zip import ZipFileSystem\n",
47+
"from pyCIAM.io import (\n",
48+
" download_and_extract_from_zenodo,\n",
49+
" download_and_extract_partial_zip,\n",
50+
" get_zenodo_file_list,\n",
51+
")\n",
52+
"from pyCIAM.utils import copy\n",
5453
"from shared import (\n",
5554
" DIR_SHP,\n",
5655
" DIR_SLR_AR5_IFILES_RAW,\n",
@@ -71,9 +70,7 @@
7170
" PATH_SLR_HIST_TREND_MAP,\n",
7271
" PATHS_SURGE_LOOKUP,\n",
7372
" save,\n",
74-
")\n",
75-
"\n",
76-
"from pyCIAM.utils import copy"
73+
")"
7774
]
7875
},
7976
{
@@ -123,76 +120,6 @@
123120
"Z_URL_SLIIDERS_PC = Z_URL_RECORDS"
124121
]
125122
},
126-
{
127-
"cell_type": "code",
128-
"execution_count": 49,
129-
"id": "8d519f83-eb91-4cb0-b2e7-5918b91d5143",
130-
"metadata": {
131-
"tags": []
132-
},
133-
"outputs": [],
134-
"source": [
135-
"def get_download_link(files, prefix):\n",
136-
" links = [\n",
137-
" i[\"links\"]\n",
138-
" for i in files\n",
139-
" if i.get(\"filename\", \"\").startswith(prefix)\n",
140-
" or i.get(\"key\", \"\").startswith(prefix)\n",
141-
" ]\n",
142-
" assert len(links) == 1\n",
143-
" links = links[0]\n",
144-
" return links.get(\"download\", links[\"self\"])\n",
145-
"\n",
146-
"\n",
147-
"def download_and_extract_full_zip(lpath, url):\n",
148-
" if lpath.exists():\n",
149-
" return None\n",
150-
" lpath.parent.mkdir(exist_ok=True, parents=True)\n",
151-
"\n",
152-
" content = BytesIO(requests.get(url, params=PARAMS).content)\n",
153-
" if isinstance(lpath, Path):\n",
154-
" with ZipFile(content, \"r\") as zip_ref:\n",
155-
" zip_ref.extractall(lpath)\n",
156-
" else:\n",
157-
" with tempfile.TemporaryDirectory() as tmpdir:\n",
158-
" with ZipFile(content, \"r\") as zip_ref:\n",
159-
" zip_ref.extractall(tmpdir)\n",
160-
" copy(Path(tmpdir), lpath)\n",
161-
"\n",
162-
"\n",
163-
"def download_and_extract_partial_zip(lpath, url, zip_glob, n_retries=5):\n",
164-
" lpath.mkdir(exist_ok=True, parents=True)\n",
165-
" z = ZipFileSystem(url)\n",
166-
" if isinstance(zip_glob, (list, set, tuple, np.ndarray)):\n",
167-
" files_remote = zip_glob\n",
168-
" else:\n",
169-
" files_remote = [p for p in z.glob(zip_glob) if not p.endswith(\"/\")]\n",
170-
" files_local = [lpath / Path(f).name for f in files_remote]\n",
171-
" for fr, fl in list(zip(files_remote, files_local)):\n",
172-
" if not fl.is_file():\n",
173-
" retries = 0\n",
174-
" while retries < n_retries:\n",
175-
" print(f\"...Downloading {fl.name} (attempt {retries+1}/{n_retries})\")\n",
176-
" try:\n",
177-
" data = z.cat_file(fr)\n",
178-
" break\n",
179-
" except FSTimeoutError:\n",
180-
" if retries < (n_retries - 1):\n",
181-
" retries += 1\n",
182-
" else:\n",
183-
" raise\n",
184-
" print(f\"...Writing {fl.name}\")\n",
185-
" fl.write_bytes(data)\n",
186-
"\n",
187-
"\n",
188-
"def download_and_extract_from_zenodo(lpath, files, prefix, zip_glob=None):\n",
189-
" dl = get_download_link(files, prefix)\n",
190-
" if zip_glob is None:\n",
191-
" return download_and_extract_full_zip(lpath, dl)\n",
192-
" else:\n",
193-
" return download_and_extract_partial_zip(lpath, dl, zip_glob)"
194-
]
195-
},
196123
{
197124
"cell_type": "code",
198125
"execution_count": 5,
@@ -202,9 +129,7 @@
202129
},
203130
"outputs": [],
204131
"source": [
205-
"pyciam_files = requests.get(\n",
206-
" Z_URL_SLIIDERS_PC.format(doi=Z_PYCIAM_DOI), params=PARAMS\n",
207-
").json()[\"files\"]"
132+
"pyciam_files = get_zenodo_file_list(Z_PYCIAM_DOI)"
208133
]
209134
},
210135
{
@@ -628,7 +553,7 @@
628553
"name": "python",
629554
"nbconvert_exporter": "python",
630555
"pygments_lexer": "ipython3",
631-
"version": "3.10.8"
556+
"version": "3.12.2"
632557
},
633558
"widgets": {
634559
"application/vnd.jupyter.widget-state+json": {

pyCIAM/io.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,22 @@
77
load_diaz_inputs
88
"""
99

10+
import tempfile
1011
from collections.abc import Iterable
12+
from io import BytesIO
13+
from pathlib import Path
14+
from zipfile import ZipFile
1115

1216
import dask.array as da
1317
import numpy as np
1418
import pandas as pd
1519
import pint_xarray # noqa: F401
20+
import requests
1621
import xarray as xr
22+
from fsspec import FSTimeoutError
23+
from fsspec.implementations.zip import ZipFileSystem
1724

25+
from pyCIAM.utils import copy
1826
from pyCIAM.utils import spherical_nearest_neighbor as snn
1927

2028
from .utils import _s2d
@@ -784,3 +792,70 @@ def load_diaz_inputs(
784792

785793
inputs = inputs.drop_dims("rcp_pt")
786794
return inputs, slr
795+
796+
797+
def get_zenodo_file_list(doi, params=None):
    """Return the ``"files"`` entry of a Zenodo record's API metadata.

    Parameters
    ----------
    doi : str or int
        Record identifier interpolated into
        ``https://zenodo.org/api/records/{doi}``.
    params : dict, optional
        Query parameters forwarded to :func:`requests.get`. Defaults to no
        extra parameters.

    Returns
    -------
    The value stored under the ``"files"`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the Zenodo API answers with a 4xx/5xx status.
    """
    response = requests.get(f"https://zenodo.org/api/records/{doi}", params=params)
    # Fail loudly on an HTTP error instead of surfacing a confusing KeyError
    # (or JSON decoding error) from the error payload below.
    response.raise_for_status()
    return response.json()["files"]
801+
802+
803+
def get_download_link(files, prefix):
    """Return the download URL of the single file entry matching ``prefix``.

    Parameters
    ----------
    files : iterable of dict
        File entries. Each entry must have a ``"links"`` mapping and is
        matched on its ``"filename"`` or ``"key"`` value (whichever is
        present).
    prefix : str
        Prefix that the ``"filename"`` or ``"key"`` of exactly one entry must
        start with.

    Returns
    -------
    The ``"download"`` link of the matching entry if it has one, otherwise its
    ``"self"`` link.

    Raises
    ------
    AssertionError
        If zero or more than one entry matches ``prefix``.
    KeyError
        If the matching entry has neither a ``"download"`` nor a ``"self"``
        link.
    """
    matches = [
        entry["links"]
        for entry in files
        if entry.get("filename", "").startswith(prefix)
        or entry.get("key", "").startswith(prefix)
    ]
    if len(matches) != 1:
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept unchanged.
        raise AssertionError(
            f"Expected exactly one file starting with {prefix!r}, "
            f"found {len(matches)}"
        )
    links = matches[0]
    # Only fall back to "self" when "download" is absent, so entries that
    # provide just a "download" link do not fail with a KeyError.
    if "download" in links:
        return links["download"]
    return links["self"]
813+
814+
815+
def _download_and_extract_full_zip(lpath, url, params=None):
    """Download a zip archive from ``url`` and extract all of it to ``lpath``.

    Nothing is downloaded if ``lpath`` already exists.

    Parameters
    ----------
    lpath : pathlib.Path or path-like object
        Destination directory. It must provide ``exists()`` and ``parent``.
        When it is not a :class:`pathlib.Path` (presumably a remote/cloud
        path -- TODO confirm), the archive is extracted to a temporary local
        directory and then transferred with :func:`pyCIAM.utils.copy`.
    url : str
        URL of the zip archive.
    params : dict, optional
        Query parameters forwarded to :func:`requests.get`. Defaults to no
        extra parameters.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    if lpath.exists():
        return None
    lpath.parent.mkdir(exist_ok=True, parents=True)

    response = requests.get(url, params=params)
    # Without this check an HTTP error body would be handed to ZipFile and
    # fail with a misleading BadZipFile error.
    response.raise_for_status()
    content = BytesIO(response.content)

    if isinstance(lpath, Path):
        with ZipFile(content, "r") as zip_ref:
            zip_ref.extractall(lpath)
    else:
        # ZipFile can only extract to the local filesystem, so stage the
        # contents locally and copy them to the destination afterwards.
        with tempfile.TemporaryDirectory() as tmpdir:
            with ZipFile(content, "r") as zip_ref:
                zip_ref.extractall(tmpdir)
            copy(Path(tmpdir), lpath)
829+
830+
831+
def download_and_extract_partial_zip(lpath, url, zip_glob, n_retries=5):
    """Download selected members of a remote zip archive into ``lpath``.

    Members are written directly into ``lpath`` under their base name (the
    directory structure inside the archive is not reproduced, so members that
    share a base name map to the same local file). Members whose local file
    already exists are skipped.

    Parameters
    ----------
    lpath : path-like object
        Destination directory; created if necessary.
    url : str
        Location of the zip archive, passed to
        :class:`fsspec.implementations.zip.ZipFileSystem`.
    zip_glob : str or list, set, tuple or numpy.ndarray of str
        Either a glob pattern evaluated against the archive (directory
        entries, i.e. paths ending in ``"/"``, are dropped) or an explicit
        collection of member paths.
    n_retries : int, default 5
        Maximum number of download attempts per member. Only
        ``FSTimeoutError`` triggers a retry.

    Raises
    ------
    ValueError
        If ``n_retries`` is smaller than 1.
    fsspec.FSTimeoutError
        If a member still times out on the last attempt.
    """
    if n_retries < 1:
        # With no attempts the download loop would never run and the write
        # below would fail on an unbound variable.
        raise ValueError(f"n_retries must be at least 1, got {n_retries}")

    lpath.mkdir(exist_ok=True, parents=True)
    z = ZipFileSystem(url)
    if isinstance(zip_glob, (list, set, tuple, np.ndarray)):
        files_remote = zip_glob
    else:
        files_remote = [p for p in z.glob(zip_glob) if not p.endswith("/")]

    for fr in files_remote:
        fl = lpath / Path(fr).name
        if fl.is_file():
            continue
        for attempt in range(1, n_retries + 1):
            print(f"...Downloading {fl.name} (attempt {attempt}/{n_retries})")
            try:
                data = z.cat_file(fr)
                break
            except FSTimeoutError:
                # Re-raise only once all attempts are used up.
                if attempt == n_retries:
                    raise
        print(f"...Writing {fl.name}")
        fl.write_bytes(data)
854+
855+
856+
def download_and_extract_from_zenodo(lpath, files, prefix, zip_glob=None):
    """Download a zip archive listed in ``files`` and extract it to ``lpath``.

    Parameters
    ----------
    lpath : path-like object
        Destination passed on to the extraction helper.
    files : iterable of dict
        File entries, passed to :func:`get_download_link`.
    prefix : str
        Prefix identifying the single entry in ``files`` to download.
    zip_glob : optional
        If ``None`` (default), the whole archive is downloaded and extracted.
        Otherwise it is passed to :func:`download_and_extract_partial_zip` to
        select which members to fetch.

    Returns
    -------
    Whatever the selected extraction helper returns.
    """
    url = get_download_link(files, prefix)
    if zip_glob is not None:
        return download_and_extract_partial_zip(lpath, url, zip_glob)
    return _download_and_extract_full_zip(lpath, url)

0 commit comments

Comments
 (0)