diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 553ec672c..052b5d6d4 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -31,6 +31,7 @@ REQUEST_RETRIES, RETRY_STATUSES, ZARR_DELETE_BATCH_SIZE, + ZARR_EXTENSIONS, DandiInstance, EmbargoStatus, ) @@ -1251,6 +1252,35 @@ def get_asset_by_path(self, path: str) -> RemoteAsset: else: return asset + def get_asset_with_subpath(self, path: str) -> RemoteAsset | ZarrWithPrefix: + def is_zarr_part(part: str) -> bool: + for ext in ZARR_EXTENSIONS: + if part.endswith(ext) and part != ext: + return True + return False + + full_path = PurePosixPath(path) + asset_path = PurePosixPath() + for i, p in enumerate(full_path.parts): + asset_path /= p + if is_zarr_part(p) and i < len(full_path.parts) - 1: + try: + asset = self.get_asset_by_path(str(asset_path)) + except NotFoundError: + pass + else: + if isinstance(asset, RemoteZarrAsset): + return ZarrWithPrefix( + zarr=asset, prefix="/".join(full_path.parts[i + 1 :]) + ) + else: + # We found a blob, which is not a folder, so no Zarr + # path can exist under it. + raise NotFoundError( + f"{path!r} is not a Zarr path with entry prefix" + ) + return self.get_asset_by_path(path) + def download_directory( self, assets_dirpath: str, @@ -1934,6 +1964,12 @@ def set_raw_metadata(self, metadata: dict[str, Any]) -> None: self._metadata = data["metadata"] +@dataclass +class ZarrWithPrefix: + zarr: RemoteZarrAsset + prefix: str + + @dataclass class RemoteZarrEntry: """ diff --git a/dandi/download.py b/dandi/download.py index 3edf495ed..c40c8e52d 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -226,6 +226,7 @@ class Downloader: url: ParsedDandiURL output_dir: InitVar[str | Path] output_prefix: Path = field(init=False) + #: just a convenience combination of output_dir and output_prefix output_path: Path = field(init=False) existing: DownloadExisting get_metadata: bool @@ -333,6 +334,12 @@ def download_generator(self) -> Iterator[dict]: asset.path, ) mtime = asset.modified + if asset.subpath: + lgr.warning( + "No downloading of subpaths within blobs yet. Got %s for %s", + asset.subpath, + asset.path, + ) _download_generator = _download_file( asset.get_download_file_iter(), download_path, @@ -352,7 +359,8 @@ def download_generator(self) -> Iterator[dict]: ), f"Asset {asset.path} is neither blob nor Zarr" _download_generator = _download_zarr( asset, - download_path, + prefix=asset.subpath, + download_path=download_path, toplevel_path=self.output_path, existing=self.existing, jobs=self.jobs_per_zarr, @@ -812,6 +820,7 @@ def _download_file( lgr.warning("downloader logic: We should not be here!") final_digest = None + if downloaded_digest and not resuming: assert downloaded_digest is not None final_digest = downloaded_digest.hexdigest() # we care only about hex @@ -977,6 +986,7 @@ def _download_zarr( toplevel_path: str | Path, existing: DownloadExisting, lock: Lock, + prefix: str | None = None, jobs: int | None = None, ) -> Iterator[dict]: # Avoid heavy import by importing within function: @@ -993,7 +1003,7 @@ def digest_callback(path: str, algoname: str, d: str) -> None: digests[path] = d def downloads_gen(): - for entry in asset.iterfiles(): + for entry in asset.iterfiles(prefix=prefix): entries.append(entry) etag = entry.digest assert etag.algorithm is DigestType.md5