Skip to content

Commit 4993b6c

Browse files
Merge pull request #166 from NeurodataWithoutBorders/inspect_on_dandisets
[Feature]: Direct inspection on Dandisets
2 parents 0e571dd + e08b023 commit 4993b6c

File tree

6 files changed

+220
-29
lines changed

6 files changed

+220
-29
lines changed

.github/workflows/testing.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,23 @@ jobs:
1010
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
1111
python-version: ["3.7", "3.10"]
1212
steps:
13-
- uses: actions/checkout@v2
14-
- run: git fetch --prune --unshallow --tags
15-
- name: Setup Python ${{ matrix.python-version }}
16-
uses: actions/setup-python@v2
13+
- uses: s-weigand/setup-conda@v1
1714
with:
15+
update-conda: true
1816
python-version: ${{ matrix.python-version }}
17+
conda-channels: conda-forge
18+
- uses: actions/checkout@v2
19+
- run: git fetch --prune --unshallow --tags
1920
- name: Install pytest
2021
run: |
2122
pip install pytest
2223
pip install pytest-cov
2324
- name: Install package
2425
run: pip install -e .
26+
- name: Uninstall h5py
27+
run: pip uninstall -y h5py
28+
- name: Install ROS3
29+
run: conda install -c conda-forge h5py
2530
- name: Run pytest with coverage
2631
run: pytest --cov=./ --cov-report xml:./nwbinspector/nwbinspector/coverage.xml
2732
- if: ${{ matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest' }}

nwbinspector/nwbinspector.py

Lines changed: 90 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Primary functions for inspecting NWBFiles."""
22
import os
3+
import re
34
import importlib
45
import traceback
56
import json
@@ -10,7 +11,7 @@
1011
from typing import Optional, List
1112
from concurrent.futures import ProcessPoolExecutor, as_completed
1213
from types import FunctionType
13-
from warnings import filterwarnings
14+
from warnings import filterwarnings, warn
1415
from distutils.util import strtobool
1516

1617
import click
@@ -26,6 +27,7 @@
2627
save_report,
2728
)
2829
from .register_checks import InspectorMessage, Importance
30+
from .tools import get_s3_urls_and_dandi_paths
2931
from .utils import FilePathType, PathType, OptionalListOfStrings
3032

3133
INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
@@ -186,6 +188,21 @@ def configure_checks(
186188
is_flag=True,
187189
)
188190
@click.option("--progress-bar", help="Set this flag to False to disable display of the progress bar.")
191+
@click.option(
192+
"--stream",
193+
help=(
194+
"Stream data from the DANDI archive. If the 'path' is a local copy of the target DANDISet, specifying this "
195+
"flag will still force the data to be streamed instead of using the local copy. To use the local copy, simply "
196+
"remove this flag. Requires the Read Only S3 (ros3) driver to be installed with h5py."
197+
),
198+
is_flag=True,
199+
)
200+
@click.option(
201+
"--version-id",
202+
help=(
203+
"When 'path' is a six-digit DANDISet ID, this further specifies which version of " "the DANDISet to inspect."
204+
),
205+
)
189206
def inspect_all_cli(
190207
path: str,
191208
modules: Optional[str] = None,
@@ -203,18 +220,34 @@ def inspect_all_cli(
203220
skip_validate: bool = False,
204221
detailed: bool = False,
205222
progress_bar: Optional[str] = None,
223+
stream: bool = False,
224+
version_id: Optional[str] = None,
206225
):
207226
"""
208227
Run the NWBInspector via the command line.
209228
210229
path :
211-
Path to either a local NWBFile, a local folder containing NWBFiles.
230+
Path to either a local NWBFile, a local folder containing NWBFiles, a link to a dataset on
231+
DANDI archive (i.e., https://dandiarchive.org/dandiset/{dandiset_id}/{version_id}), or a six-digit Dandiset ID.
212232
"""
213233
levels = ["importance", "file_path"] if levels is None else levels.split(",")
214234
reverse = [False] * len(levels) if reverse is None else [strtobool(x) for x in reverse.split(",")]
215235
progress_bar = strtobool(progress_bar) if progress_bar is not None else True
216236
if config is not None:
217237
config = load_config(filepath_or_keyword=config)
238+
if stream:
239+
url_path = path if path.startswith("https://") else None
240+
if url_path:
241+
dandiset_id, version_id = url_path.split("/")[-2:]
242+
path = dandiset_id
243+
assert url_path or re.fullmatch(
244+
pattern="^[0-9]{6}$", string=path
245+
), "'--stream' flag was enabled, but 'path' is neither a full link to the DANDI archive nor a DANDISet ID."
246+
if Path(path).is_dir():
247+
warn(
248+
f"The local DANDISet '{path}' exists, but the '--stream' flag was used. "
249+
"NWBInspector will use S3 streaming from DANDI. To use local data, remove the '--stream' flag."
250+
)
218251
messages = list(
219252
inspect_all(
220253
path=path,
@@ -226,6 +259,8 @@ def inspect_all_cli(
226259
n_jobs=n_jobs,
227260
skip_validate=skip_validate,
228261
progress_bar=progress_bar,
262+
stream=stream,
263+
version_id=version_id,
229264
)
230265
)
231266
if json_file_path is not None:
@@ -254,14 +289,17 @@ def inspect_all(
254289
skip_validate: bool = False,
255290
progress_bar: bool = True,
256291
progress_bar_options: Optional[dict] = None,
292+
stream: bool = False,
293+
version_id: Optional[str] = None,
257294
):
258295
"""
259296
Inspect a local NWBFile or folder of NWBFiles and return suggestions for improvements according to best practices.
260297
261298
Parameters
262299
----------
263300
path : PathType
264-
File path to an NWBFile, or folder path to iterate over recursively and scan all NWBFiles present.
301+
File path to an NWBFile, folder path to iterate over recursively and scan all NWBFiles present, or a
302+
six-digit identifier of the DANDISet.
265303
modules : list of strings, optional
266304
List of external module names to load; examples would be namespace extensions.
267305
These modules may also contain their own custom checks for their extensions.
@@ -294,22 +332,46 @@ def inspect_all(
294332
Defaults to True.
295333
progress_bar_options : dict, optional
296334
Dictionary of keyword arguments to pass directly to tqdm.
335+
stream : bool, optional
336+
Stream data from the DANDI archive. If the 'path' is a local copy of the target DANDISet, setting this
337+
argument to True will force the data to be streamed instead of using the local copy.
338+
Requires the Read Only S3 (ros3) driver to be installed with h5py.
339+
Defaults to False.
340+
version_id : str, optional
341+
If the path is a DANDISet ID, version_id additionally specifies which version of the dataset to read from.
342+
Common options are 'draft' or 'published'.
343+
Defaults to the most recent published version, or if not published then the most recent draft version.
297344
"""
298345
modules = modules or []
299346
if progress_bar_options is None:
300-
progress_bar_options = dict(desc="Inspecting NWBFiles...")
301-
in_path = Path(path)
302-
if in_path.is_dir():
303-
nwbfiles = list(in_path.rglob("*.nwb"))
304-
elif in_path.is_file():
305-
nwbfiles = [in_path]
347+
progress_bar_options = dict(position=0, leave=False)
348+
if stream:
349+
progress_bar_options.update(desc="Inspecting NWBFiles with ROS3...")
350+
else:
351+
progress_bar_options.update(desc="Inspecting NWBFiles...")
352+
if stream:
353+
assert (
354+
re.fullmatch(pattern="^[0-9]{6}$", string=str(path)) is not None
355+
), "'--stream' flag was enabled, but 'path' is not a DANDISet ID."
356+
driver = "ros3"
357+
nwbfiles = get_s3_urls_and_dandi_paths(dandiset_id=path, version_id=version_id, n_jobs=n_jobs)
306358
else:
307-
raise ValueError(f"{in_path} should be a directory or an NWB file.")
359+
driver = None
360+
in_path = Path(path)
361+
if in_path.is_dir():
362+
nwbfiles = list(in_path.rglob("*.nwb"))
363+
elif in_path.is_file():
364+
nwbfiles = [in_path]
365+
else:
366+
raise ValueError(f"{in_path} should be a directory or an NWB file.")
308367
for module in modules:
309368
importlib.import_module(module)
310369
# Filtering of checks should apply after external modules are imported, in case those modules have their own checks
311370
checks = configure_checks(config=config, ignore=ignore, select=select, importance_threshold=importance_threshold)
312371

372+
nwbfiles_iterable = nwbfiles
373+
if progress_bar:
374+
nwbfiles_iterable = tqdm(nwbfiles_iterable, **progress_bar_options)
313375
if n_jobs != 1:
314376
progress_bar_options.update(total=len(nwbfiles))
315377
futures = []
@@ -318,26 +380,34 @@ def inspect_all(
318380
for nwbfile_path in nwbfiles:
319381
futures.append(
320382
executor.submit(
321-
_pickle_inspect_nwb, nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate
383+
_pickle_inspect_nwb,
384+
nwbfile_path=nwbfile_path,
385+
checks=checks,
386+
skip_validate=skip_validate,
387+
driver=driver,
322388
)
323389
)
324-
completed_futures = as_completed(futures)
390+
nwbfiles_iterable = as_completed(futures)
325391
if progress_bar:
326-
completed_futures = tqdm(completed_futures, **progress_bar_options)
327-
for future in completed_futures:
392+
nwbfiles_iterable = tqdm(nwbfiles_iterable, **progress_bar_options)
393+
for future in nwbfiles_iterable:
328394
for message in future.result():
395+
if stream:
396+
message.file_path = nwbfiles[message.file_path]
329397
yield message
330398
else:
331-
if progress_bar:
332-
nwbfiles = tqdm(nwbfiles, **progress_bar_options)
333-
for nwbfile_path in nwbfiles:
334-
for message in inspect_nwb(nwbfile_path=nwbfile_path, checks=checks):
399+
for nwbfile_path in nwbfiles_iterable:
400+
for message in inspect_nwb(nwbfile_path=nwbfile_path, checks=checks, driver=driver):
401+
if stream:
402+
message.file_path = nwbfiles[message.file_path]
335403
yield message
336404

337405

338-
def _pickle_inspect_nwb(nwbfile_path: str, checks: list = available_checks, skip_validate: bool = False):
406+
def _pickle_inspect_nwb(
407+
nwbfile_path: str, checks: list = available_checks, skip_validate: bool = False, driver: Optional[str] = None
408+
):
339409
"""Auxilliary function for inspect_all to run in parallel using the ProcessPoolExecutor."""
340-
return list(inspect_nwb(nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate))
410+
return list(inspect_nwb(nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate, driver=driver))
341411

342412

343413
def inspect_nwb(

nwbinspector/register_checks.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,7 @@ class InspectorMessage:
7878
file_path: str = None
7979

8080
def __repr__(self):
81-
return ["InspectorMessage(\n"] + "\n,".join(
82-
+[f" {k}={v.__repr__()}" for k, v in self.__dict__.items()] + [")"]
83-
)
81+
return "InspectorMessage(\n" + ",\n".join([f" {k}={v.__repr__()}" for k, v in self.__dict__.items()]) + "\n)"
8482

8583

8684
# TODO: neurodata_type could have annotation hdmf.utils.ExtenderMeta, which seems to apply to all currently checked
@@ -175,7 +173,6 @@ def parse_location(neurodata_object) -> Optional[str]:
175173
for key, val in known_locations.items():
176174
if isinstance(neurodata_object, key):
177175
return val
178-
179176
"""Infer the human-readable path of the object within an NWBFile by tracing its parents."""
180177
if neurodata_object.parent is None:
181178
return "/"

nwbinspector/tools.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
"""Helper functions for internal use that rely on external dependencies (i.e., pynwb)."""
2+
import re
23
from uuid import uuid4
34
from datetime import datetime
5+
from typing import Optional, Dict
6+
from concurrent.futures import ProcessPoolExecutor, as_completed
47

58
from pynwb import NWBFile
9+
from dandi.dandiapi import DandiAPIClient
610

711

812
def make_minimal_nwbfile():
@@ -22,3 +26,43 @@ def get_nwbfile_path_from_internal_object(obj):
2226
if isinstance(obj, NWBFile):
2327
return obj.container_source
2428
return obj.get_ancestor("NWBFile").container_source
29+
30+
31+
def get_s3_urls_and_dandi_paths(dandiset_id: str, version_id: Optional[str] = None, n_jobs: int = 1) -> Dict[str, str]:
32+
"""
33+
Collect S3 URLS from a DANDISet ID.
34+
35+
Returns dictionary that maps each S3 url to the displayed file path on the DANDI archive content page.
36+
"""
37+
assert re.fullmatch(
38+
pattern="^[0-9]{6}$", string=dandiset_id
39+
), "The specified 'path' is not a proper DANDISet ID. It should be a six-digit numeric identifier."
40+
41+
s3_urls_to_dandi_paths = dict()
42+
if n_jobs != 1:
43+
with DandiAPIClient() as client:
44+
dandiset = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id)
45+
max_workers = n_jobs if n_jobs > 0 else None
46+
with ProcessPoolExecutor(max_workers=max_workers) as executor:
47+
futures = []
48+
for asset in dandiset.get_assets():
49+
if asset.path.split(".")[-1] == "nwb":
50+
futures.append(
51+
executor.submit(
52+
_get_content_url_and_path, asset=asset, follow_redirects=1, strip_query=True
53+
)
54+
)
55+
for future in as_completed(futures):
56+
s3_urls_to_dandi_paths.update(future.result())
57+
else:
58+
with DandiAPIClient() as client:
59+
dandiset = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id)
60+
for asset in dandiset.get_assets():
61+
if asset.path.split(".")[-1] == "nwb":
62+
s3_urls_to_dandi_paths.update(_get_content_url_and_path(asset=asset))
63+
return s3_urls_to_dandi_paths
64+
65+
66+
def _get_content_url_and_path(asset, follow_redirects: int = 1, strip_query: bool = True) -> Dict[str, str]:
67+
"""Private helper function for parallelization in 'get_s3_urls_and_dandi_paths'."""
68+
return {asset.get_content_url(follow_redirects=1, strip_query=True): asset.path}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
packages=find_packages(),
1515
include_package_data=True,
1616
url="https://github.com/NeurodataWithoutBorders/nwbinspector",
17-
install_requires=["pynwb", "natsort", "click", "PyYAML", "jsonschema", "tqdm"],
17+
install_requires=["pynwb", "natsort", "click", "PyYAML", "jsonschema", "dandi", "tqdm"],
1818
entry_points={"console_scripts": ["nwbinspector=nwbinspector.nwbinspector:inspect_all_cli"]},
1919
license="BSD-3-Clause",
2020
)

0 commit comments

Comments
 (0)