Merge pull request #32 from atomic-data-sciences/enhancement/better_file_downloading

munrojm · web-flow · commit 5cd32f06cf1a · 2025-05-27T17:11:27.000-04:00
Add better download convenience method
diff --git a/examples/general_use.ipynb b/examples/general_use.ipynb
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "id": "46b4432e-a8a9-4952-903c-f2f5dd8331c8",
    "metadata": {},
    "outputs": [],
@@ -935,6 +935,67 @@
    "source": [
     "For more information on other data from the API or other example use see notebooks in the code repository (https://github.com/atomic-data-sciences/api-client) and the documentation (https://atomic-data-sciences.github.io/api-client/)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4291e2cb-0f26-48aa-b8e9-b8b7e528fcc5",
+   "metadata": {},
+   "source": [
+    "### Download Processed Videos"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1af857ea-5e3c-4843-9eb4-2a15f0379fbb",
+   "metadata": {},
+   "source": [
+    "Individual processed videos can be downloaded as MP4 files using data IDs obtained from a search or from the web interface."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5d665f0f-89f7-4e75-8894-f7095fb4570b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_ids = results[\"Data ID\"].to_list()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "810936a2-d007-4eb2-8ee2-43962ba6b01c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3878a999008544188c5a787708efd064",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "client.download_videos(data_ids=data_ids, dest_dir=\"./\")"
+   ]
   }
  ],
  "metadata": {
diff --git a/src/atomicds/client.py b/src/atomicds/client.py
@@ -474,3 +474,102 @@ def __upload_chunk(
                     future.result()  # raise early if anything went wrong
                     if main_task is not None:
                         progress.update(main_task, advance=1, refresh=True)
+
+    def download_videos(
+        self,
+        data_ids: str | list[str],
+        dest_dir: str | Path | None = None,
+    ):
+        """
+        Download processed RHEED videos to disk.
+
+        Args:
+            data_ids (str | list[str]): One or more data IDs from the data catalogue.
+            dest_dir (str | Path | None): Directory to write the files to.
+                Defaults to the current working directory.
+        """
+        chunk_size: int = 20 * 1024 * 1024  # 20 MiB read chunks
+
+        # Normalise inputs
+        if isinstance(data_ids, str):
+            data_ids = [data_ids]
+        if dest_dir is None:
+            dest_dir = Path.cwd()
+        else:
+            dest_dir = Path(dest_dir).expanduser().resolve()
+        dest_dir.mkdir(parents=True, exist_ok=True)
+
+        def __download_one(data_id: str) -> None:
+            # 1) Resolve the presigned URL -------------------------------------
+            meta: dict = self._get(  # type: ignore  # noqa: PGH003
+                sub_url=f"data_entries/processed_data/{data_id}",
+                params={"return_as": "url-download"},
+            )
+            if meta is None:
+                raise ClientError(f"No processed data found for data_id '{data_id}'")
+
+            url = meta["url"]
+            file_name = (
+                meta.get("file_name") or f"{data_id}.{meta.get('file_format', 'mp4')}"
+            )
+            target = dest_dir / file_name  # type: ignore # noqa: PGH003
+
+            # 2) Open the stream *once* (HEAD not allowed)
+            with self._session.get(  # type: ignore  # noqa: PGH003
+                url, stream=True, allow_redirects=True, timeout=30
+            ) as resp:
+                resp.raise_for_status()
+
+                # Attempt to read the size from **this** GET response
+                total_size = int(resp.headers.get("Content-Length", 0))
+
+                # 3) Create a nested bar for this file
+                if total_size:  # we know the size → percent bar
+                    bar_id = progress.add_task(
+                        f"[red]{file_name}",
+                        total=total_size,
+                        show_percent=True,
+                        show_total=False,
+                        show_spinner=False,
+                        pad="",
+                    )
+                else:  # unknown size → indeterminate spinner
+                    bar_id = progress.add_task(
+                        f"[red]{file_name}",
+                        total=None,
+                        show_percent=False,
+                        show_total=False,
+                        show_spinner=True,
+                        pad="",
+                    )
+
+                # 4) Stream the bytes to disk with updates
+                with Path.open(target, "wb") as fh:
+                    for chunk in resp.iter_content(chunk_size):
+                        if chunk:  # filter out keep-alive
+                            fh.write(chunk)
+                            progress.update(bar_id, advance=len(chunk))
+
+        # Download files
+        with _make_progress(self.mute_bars, False) as progress:
+            # master bar
+            master_task = None
+            if not progress.disable:
+                master_task = progress.add_task(
+                    "Downloading videos…",
+                    total=len(data_ids),
+                    show_percent=False,
+                    show_total=True,
+                    show_spinner=True,
+                    pad="",
+                )
+
+            # thread-pool for concurrent downloads
+            max_workers = min(8, len(data_ids))
+            with ThreadPoolExecutor(max_workers=max_workers) as pool:
+                futures = {pool.submit(__download_one, did): did for did in data_ids}
+                for fut in as_completed(futures):
+                    # propagate any exceptions early
+                    fut.result()
+                    if master_task is not None:
+                        progress.update(master_task, advance=1, refresh=True)