Skip to content

Commit 5cd32f0

Browse files
authored
Merge pull request #32 from atomic-data-sciences/enhancement/better_file_downloading
Add better download convenience method
2 parents 4d2e02e + ba329c1 commit 5cd32f0

File tree

2 files changed

+161
-1
lines changed

2 files changed

+161
-1
lines changed

examples/general_use.ipynb

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
},
3737
{
3838
"cell_type": "code",
39-
"execution_count": 2,
39+
"execution_count": 4,
4040
"id": "46b4432e-a8a9-4952-903c-f2f5dd8331c8",
4141
"metadata": {},
4242
"outputs": [],
@@ -935,6 +935,67 @@
935935
"source": [
936936
"For more information on other data available from the API, or for further usage examples, see the notebooks in the code repository (https://github.com/atomic-data-sciences/api-client) and the documentation (https://atomic-data-sciences.github.io/api-client/)."
937937
]
938+
},
939+
{
940+
"cell_type": "markdown",
941+
"id": "4291e2cb-0f26-48aa-b8e9-b8b7e528fcc5",
942+
"metadata": {},
943+
"source": [
944+
"### Download Processed Videos"
945+
]
946+
},
947+
{
948+
"cell_type": "markdown",
949+
"id": "1af857ea-5e3c-4843-9eb4-2a15f0379fbb",
950+
"metadata": {},
951+
"source": [
952+
"Individual processed videos can be downloaded as MP4 files using data IDs obtained from a search or from the web interface."
953+
]
954+
},
955+
{
956+
"cell_type": "code",
957+
"execution_count": 5,
958+
"id": "5d665f0f-89f7-4e75-8894-f7095fb4570b",
959+
"metadata": {},
960+
"outputs": [],
961+
"source": [
962+
"data_ids = results[\"Data ID\"].to_list()"
963+
]
964+
},
965+
{
966+
"cell_type": "code",
967+
"execution_count": 6,
968+
"id": "810936a2-d007-4eb2-8ee2-43962ba6b01c",
969+
"metadata": {},
970+
"outputs": [
971+
{
972+
"data": {
973+
"application/vnd.jupyter.widget-view+json": {
974+
"model_id": "3878a999008544188c5a787708efd064",
975+
"version_major": 2,
976+
"version_minor": 0
977+
},
978+
"text/plain": [
979+
"Output()"
980+
]
981+
},
982+
"metadata": {},
983+
"output_type": "display_data"
984+
},
985+
{
986+
"data": {
987+
"text/html": [
988+
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
989+
],
990+
"text/plain": []
991+
},
992+
"metadata": {},
993+
"output_type": "display_data"
994+
}
995+
],
996+
"source": [
997+
"client.download_videos(data_ids=data_ids, dest_dir=\"./\")"
998+
]
938999
}
9391000
],
9401001
"metadata": {

src/atomicds/client.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,3 +474,102 @@ def __upload_chunk(
474474
future.result() # raise early if anything went wrong
475475
if main_task is not None:
476476
progress.update(main_task, advance=1, refresh=True)
477+
478+
def download_videos(
479+
self,
480+
data_ids: str | list[str],
481+
dest_dir: str | Path | None = None,
482+
):
483+
"""
484+
Download processed RHEED videos to disk.
485+
486+
Args:
487+
data_ids (str | list[str]): One or more data IDs from the data catalogue.
488+
dest_dir (str | Path | None): Directory to write the files to.
489+
Defaults to the current working directory.
490+
"""
491+
chunk_size: int = 20 * 1024 * 1024 # 20 MiB read chunks
492+
493+
# Normalise inputs
494+
if isinstance(data_ids, str):
495+
data_ids = [data_ids]
496+
if dest_dir is None:
497+
dest_dir = Path.cwd()
498+
else:
499+
dest_dir = Path(dest_dir).expanduser().resolve()
500+
dest_dir.mkdir(parents=True, exist_ok=True)
501+
502+
def __download_one(data_id: str) -> None:
503+
# 1) Resolve the presigned URL -------------------------------------
504+
meta: dict = self._get( # type: ignore # noqa: PGH003
505+
sub_url=f"data_entries/processed_data/{data_id}",
506+
params={"return_as": "url-download"},
507+
)
508+
if meta is None:
509+
raise ClientError(f"No processed data found for data_id '{data_id}'")
510+
511+
url = meta["url"]
512+
file_name = (
513+
meta.get("file_name") or f"{data_id}.{meta.get('file_format', 'mp4')}"
514+
)
515+
target = dest_dir / file_name # type: ignore # noqa: PGH003
516+
517+
# 2) Open the stream *once* (HEAD not allowed)
518+
with self._session.get( # type: ignore # noqa: PGH003
519+
url, stream=True, allow_redirects=True, timeout=30
520+
) as resp:
521+
resp.raise_for_status()
522+
523+
# Attempt to read the size from **this** GET response
524+
total_size = int(resp.headers.get("Content-Length", 0))
525+
526+
# 3) Create a nested bar for this file
527+
if total_size: # we know the size → percent bar
528+
bar_id = progress.add_task(
529+
f"[red]{file_name}",
530+
total=total_size,
531+
show_percent=True,
532+
show_total=False,
533+
show_spinner=False,
534+
pad="",
535+
)
536+
else: # unknown size → indeterminate spinner
537+
bar_id = progress.add_task(
538+
f"[red]{file_name}",
539+
total=None,
540+
show_percent=False,
541+
show_total=False,
542+
show_spinner=True,
543+
pad="",
544+
)
545+
546+
# 4) Stream the bytes to disk with updates
547+
with Path.open(target, "wb") as fh:
548+
for chunk in resp.iter_content(chunk_size):
549+
if chunk: # filter out keep-alive
550+
fh.write(chunk)
551+
progress.update(bar_id, advance=len(chunk))
552+
553+
# Download files
554+
with _make_progress(self.mute_bars, False) as progress:
555+
# master bar
556+
master_task = None
557+
if not progress.disable:
558+
master_task = progress.add_task(
559+
"Downloading videos…",
560+
total=len(data_ids),
561+
show_percent=False,
562+
show_total=True,
563+
show_spinner=True,
564+
pad="",
565+
)
566+
567+
# thread-pool for concurrent downloads
568+
max_workers = min(8, len(data_ids))
569+
with ThreadPoolExecutor(max_workers=max_workers) as pool:
570+
futures = {pool.submit(__download_one, did): did for did in data_ids}
571+
for fut in as_completed(futures):
572+
# propagate any exceptions early
573+
fut.result()
574+
if master_task is not None:
575+
progress.update(master_task, advance=1, refresh=True)

0 commit comments

Comments
 (0)