Skip to content

Commit 89cb362

Browse files
committed
Merge remote-tracking branch 'origin/order_of_images' into order_of_images
2 parents 8b4a937 + abcff82 commit 89cb362

File tree

10 files changed

+166
-36
lines changed

10 files changed

+166
-36
lines changed

nwbinspector/checks/images.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@ def check_order_of_images_unique(images: Images):
1010
if images.order_of_images is None:
1111
return
1212
if not len(set(images.order_of_images)) == len(images.order_of_images):
13-
return InspectorMessage(
14-
message="order_of_images should have unique values."
15-
)
13+
return InspectorMessage(message="order_of_images should have unique values.")
1614

1715

1816
@register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=Images)
@@ -22,5 +20,5 @@ def check_order_of_images_len(images: Images):
2220
if not len(images.order_of_images) == len(images.images):
2321
return InspectorMessage(
2422
message=f"Length of order_of_images ({len(images.order_of_images)}) does not match the number of images ("
25-
f"{len(images.images)})."
26-
)
23+
f"{len(images.images)})."
24+
)

nwbinspector/nwbinspector.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
from pathlib import Path
99
from collections.abc import Iterable
1010
from enum import Enum
11-
from typing import Optional, List
11+
from typing import Union, Optional, List
1212
from concurrent.futures import ProcessPoolExecutor, as_completed
1313
from types import FunctionType
1414
from warnings import filterwarnings, warn
1515
from distutils.util import strtobool
16+
from time import sleep
1617

1718
import click
1819
import pynwb
@@ -28,7 +29,7 @@
2829
)
2930
from .register_checks import InspectorMessage, Importance
3031
from .tools import get_s3_urls_and_dandi_paths
31-
from .utils import FilePathType, PathType, OptionalListOfStrings
32+
from .utils import FilePathType, PathType, OptionalListOfStrings, robust_s3_read, calculate_number_of_cpu
3233

3334
INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
3435

@@ -278,7 +279,7 @@ def inspect_all(
278279
config: Optional[dict] = None,
279280
ignore: OptionalListOfStrings = None,
280281
select: OptionalListOfStrings = None,
281-
importance_threshold: Importance = Importance.BEST_PRACTICE_SUGGESTION,
282+
importance_threshold: Union[str, Importance] = Importance.BEST_PRACTICE_SUGGESTION,
282283
n_jobs: int = 1,
283284
skip_validate: bool = False,
284285
progress_bar: bool = True,
@@ -305,7 +306,7 @@ def inspect_all(
305306
Names of functions to skip.
306307
select: list of strings, optional
307308
Names of functions to pick out of available checks.
308-
importance_threshold : string, optional
309+
importance_threshold : string or Importance, optional
309310
Ignores tests with an assigned importance below this threshold.
310311
Importance has three levels:
311312
CRITICAL
@@ -317,6 +318,8 @@ def inspect_all(
317318
The default is the lowest level, BEST_PRACTICE_SUGGESTION.
318319
n_jobs : int
319320
Number of jobs to use in parallel. Set to -1 to use all available resources.
321+
This may also be a negative integer x from -2 to -(number_of_cpus - 1) which acts like negative slicing by using
322+
all available CPUs minus x.
320323
Set to 1 (also the default) to disable.
321324
skip_validate : bool, optional
322325
Skip the PyNWB validation step. This may be desired for older NWBFiles (< schema version v2.10).
@@ -336,7 +339,11 @@ def inspect_all(
336339
Common options are 'draft' or 'published'.
337340
Defaults to the most recent published version, or if not published then the most recent draft version.
338341
"""
342+
importance_threshold = (
343+
Importance[importance_threshold] if isinstance(importance_threshold, str) else importance_threshold
344+
)
339345
modules = modules or []
346+
n_jobs = calculate_number_of_cpu(requested_cpu=n_jobs)
340347
if progress_bar_options is None:
341348
progress_bar_options = dict(position=0, leave=False)
342349
if stream:
@@ -410,9 +417,10 @@ def inspect_nwb(
410417
config: dict = None,
411418
ignore: OptionalListOfStrings = None,
412419
select: OptionalListOfStrings = None,
413-
importance_threshold: Importance = Importance.BEST_PRACTICE_SUGGESTION,
414-
driver: str = None,
420+
importance_threshold: Union[str, Importance] = Importance.BEST_PRACTICE_SUGGESTION,
421+
driver: Optional[str] = None,
415422
skip_validate: bool = False,
423+
max_retries: int = 10,
416424
) -> List[InspectorMessage]:
417425
"""
418426
Inspect a NWBFile object and return suggestions for improvements according to best practices.
@@ -431,7 +439,7 @@ def inspect_nwb(
431439
Names of functions to skip.
432440
select: list, optional
433441
Names of functions to pick out of available checks.
434-
importance_threshold : string, optional
442+
importance_threshold : string or Importance, optional
435443
Ignores tests with an assigned importance below this threshold.
436444
Importance has three levels:
437445
CRITICAL
@@ -446,14 +454,23 @@ def inspect_nwb(
446454
skip_validate : bool
447455
Skip the PyNWB validation step. This may be desired for older NWBFiles (< schema version v2.10).
448456
The default is False, which is also recommended.
457+
max_retries : int, optional
458+
When using the ros3 driver to stream data from an s3 path, occasional curl issues can result.
459+
AWS suggests using iterative retry with an exponential backoff of 0.1 * 2^retries.
460+
This sets a hard bound on the number of times to attempt to retry the collection of messages.
461+
Defaults to 10 (corresponds to 102.4s maximum delay on final attempt).
449462
"""
463+
importance_threshold = (
464+
Importance[importance_threshold] if isinstance(importance_threshold, str) else importance_threshold
465+
)
450466
if any(x is not None for x in [config, ignore, select, importance_threshold]):
451467
checks = configure_checks(
452468
checks=checks, config=config, ignore=ignore, select=select, importance_threshold=importance_threshold
453469
)
454470
nwbfile_path = str(nwbfile_path)
455471
filterwarnings(action="ignore", message="No cached namespaces found in .*")
456472
filterwarnings(action="ignore", message="Ignoring cached namespace .*")
473+
457474
with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True, driver=driver) as io:
458475
if not skip_validate:
459476
validation_errors = pynwb.validate(io=io)
@@ -467,7 +484,7 @@ def inspect_nwb(
467484
)
468485

469486
try:
470-
nwbfile = io.read()
487+
nwbfile = robust_s3_read(command=io.read, max_retries=max_retries)
471488
for inspector_message in run_checks(nwbfile=nwbfile, checks=checks):
472489
inspector_message.file_path = nwbfile_path
473490
yield inspector_message
@@ -493,7 +510,7 @@ def run_checks(nwbfile: pynwb.NWBFile, checks: list):
493510
for nwbfile_object in nwbfile.objects.values():
494511
if check_function.neurodata_type is None or issubclass(type(nwbfile_object), check_function.neurodata_type):
495512
try:
496-
output = check_function(nwbfile_object)
513+
output = robust_s3_read(command=check_function, command_args=[nwbfile_object])
497514
# if an individual check fails, include it in the report and continue with the inspection
498515
except Exception:
499516
output = InspectorMessage(

nwbinspector/tools.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from pynwb import NWBFile
99

10-
from .utils import is_module_installed
10+
from .utils import is_module_installed, calculate_number_of_cpu
1111

1212

1313
def make_minimal_nwbfile():
@@ -43,6 +43,7 @@ def get_s3_urls_and_dandi_paths(dandiset_id: str, version_id: Optional[str] = No
4343
), "The specified 'path' is not a proper DANDISet ID. It should be a six-digit numeric identifier."
4444

4545
s3_urls_to_dandi_paths = dict()
46+
n_jobs = calculate_number_of_cpu(requested_cpu=n_jobs)
4647
if n_jobs != 1:
4748
with DandiAPIClient() as client:
4849
dandiset = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id)

nwbinspector/utils.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
"""Commonly reused logic for evaluating conditions; must not have external dependencies."""
2+
import os
23
import re
34
import json
45
import numpy as np
5-
from typing import TypeVar, Optional, List
6+
from typing import TypeVar, Optional, List, Dict, Callable
67
from pathlib import Path
78
from importlib import import_module
89
from packaging import version
10+
from time import sleep
911

1012
PathType = TypeVar("PathType", str, Path) # For types that can be either files or folders
1113
FilePathType = TypeVar("FilePathType", str, Path)
@@ -113,3 +115,41 @@ def get_package_version(name: str) -> version.Version:
113115

114116
package_version = get_distribution(name).version
115117
return version.parse(package_version)
118+
119+
120+
def robust_s3_read(
121+
command: Callable, max_retries: int = 10, command_args: Optional[list] = None, command_kwargs: Optional[Dict] = None
122+
):
123+
"""Attempt the command (usually acting on an S3 IO) up to the number of max_retries using exponential backoff."""
124+
command_args = command_args or []
125+
command_kwargs = command_kwargs or dict()
126+
for retry in range(max_retries):
127+
try:
128+
return command(*command_args, **command_kwargs)
129+
except OSError: # cannot curl request
130+
sleep(0.1 * 2**retry)
131+
except Exception as exc:
132+
raise exc
133+
raise TimeoutError(f"Unable to complete the command ({command.__name__}) after {max_retries} attempts!")
134+
135+
136+
def calculate_number_of_cpu(requested_cpu: int = 1) -> int:
137+
"""
138+
Calculate the number CPUs to use with respect to negative slicing and check against maximal available resources.
139+
140+
Parameters
141+
----------
142+
requested_cpu : int, optional
143+
The desired number of CPUs to use.
144+
145+
The default is 1.
146+
"""
147+
total_cpu = os.cpu_count()
148+
assert requested_cpu <= total_cpu, f"Requested more CPUs ({requested_cpu}) than are available ({total_cpu})!"
149+
assert requested_cpu >= -(
150+
total_cpu - 1
151+
), f"Requested fewer CPUs ({requested_cpu}) than are available ({total_cpu})!"
152+
if requested_cpu > 0:
153+
return requested_cpu
154+
else:
155+
return total_cpu + requested_cpu

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
long_description = f.read()
66
setup(
77
name="nwbinspector",
8-
version="0.4.6",
8+
version="0.4.7",
99
description="Tool to inspect NWB files for best practices compliance.",
1010
long_description=long_description,
1111
long_description_content_type="text/markdown",

tests/test_inspector.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ def test_inspect_nwb(self):
330330
]
331331
self.assertCountEqual(first=test_results, second=true_results)
332332

333-
def test_inspect_nwb_importance_threshold(self):
333+
def test_inspect_nwb_importance_threshold_as_importance(self):
334334
test_results = list(
335335
inspect_nwb(
336336
nwbfile_path=self.nwbfile_paths[0], checks=self.checks, importance_threshold=Importance.CRITICAL
@@ -361,6 +361,35 @@ def test_inspect_nwb_importance_threshold(self):
361361
]
362362
self.assertCountEqual(first=test_results, second=true_results)
363363

364+
def test_inspect_nwb_importance_threshold_as_string(self):
365+
test_results = list(
366+
inspect_nwb(nwbfile_path=self.nwbfile_paths[0], checks=self.checks, importance_threshold="CRITICAL")
367+
)
368+
true_results = [
369+
InspectorMessage(
370+
message=(
371+
"Data may be in the wrong orientation. Time should be in the first dimension, and is "
372+
"usually the longest dimension. Here, another dimension is longer."
373+
),
374+
importance=Importance.CRITICAL,
375+
check_function_name="check_data_orientation",
376+
object_type="SpatialSeries",
377+
object_name="my_spatial_series",
378+
location="/processing/behavior/Position/my_spatial_series",
379+
file_path=self.nwbfile_paths[0],
380+
),
381+
InspectorMessage(
382+
message="The length of the first dimension of data does not match the length of timestamps.",
383+
importance=Importance.CRITICAL,
384+
check_function_name="check_timestamps_match_first_dimension",
385+
object_type="TimeSeries",
386+
object_name="test_time_series_3",
387+
location="/acquisition/test_time_series_3",
388+
file_path=self.nwbfile_paths[0],
389+
),
390+
]
391+
self.assertCountEqual(first=test_results, second=true_results)
392+
364393
def test_command_line_runs_cli_only(self):
365394
console_output_file = self.tempdir / "test_console_output.txt"
366395
os.system(

tests/test_utils.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
1+
import os
12
from packaging import version
23

34
from hdmf.testing import TestCase
45

56
from nwbinspector import Importance
6-
from nwbinspector.utils import format_byte_size, check_regular_series, is_dict_in_string, get_package_version
7+
from nwbinspector.utils import (
8+
format_byte_size,
9+
check_regular_series,
10+
is_dict_in_string,
11+
get_package_version,
12+
calculate_number_of_cpu,
13+
)
714

815

916
def test_format_byte_size():
@@ -104,3 +111,27 @@ def test_get_package_version_type():
104111

105112
def test_get_package_version_value():
106113
assert get_package_version("hdmf") >= version.parse("3.1.1") # minimum supported PyNWB version
114+
115+
116+
class TestCalulcateNumberOfCPU(TestCase):
117+
total_cpu = os.cpu_count()
118+
119+
def test_request_more_than_available_assert(self):
120+
requested_cpu = 2500
121+
with self.assertRaisesWith(
122+
exc_type=AssertionError,
123+
exc_msg=f"Requested more CPUs ({requested_cpu}) than are available ({self.total_cpu})!",
124+
):
125+
calculate_number_of_cpu(requested_cpu=requested_cpu)
126+
127+
def test_request_fewer_than_available_assert(self):
128+
requested_cpu = -2500
129+
with self.assertRaisesWith(
130+
exc_type=AssertionError,
131+
exc_msg=f"Requested fewer CPUs ({requested_cpu}) than are available ({self.total_cpu})!",
132+
):
133+
calculate_number_of_cpu(requested_cpu=requested_cpu)
134+
135+
def test_calculate_number_of_cpu_negative_value(self):
136+
requested_cpu = -1 # CI only has 2 jobs available
137+
assert calculate_number_of_cpu(requested_cpu=requested_cpu) == requested_cpu % self.total_cpu

tests/unit_tests/test_images.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,3 @@ def test_pass_check_order_of_images_len():
5555
images = Images(name="my_images", images=imgs, order_of_images=img_refs)
5656

5757
assert check_order_of_images_len(images) is None
58-
59-
60-

tests/unit_tests/test_tables.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import platform
22
import json
33
from unittest import TestCase
4+
from packaging import version
45

56
import numpy as np
67
from hdmf.common import DynamicTable, DynamicTableRegion
@@ -17,6 +18,7 @@
1718
check_single_row,
1819
check_table_values_for_dict,
1920
)
21+
from nwbinspector.utils import get_package_version
2022

2123

2224
class TestCheckDynamicTableRegion(TestCase):
@@ -237,16 +239,27 @@ def test_check_single_row_ignore_electrodes():
237239
table = ElectrodeTable(
238240
name="electrodes", # default name when building through nwbfile
239241
)
240-
table.add_row(
241-
x=np.nan,
242-
y=np.nan,
243-
z=np.nan,
244-
imp=np.nan,
245-
location="unknown",
246-
filtering="unknown",
247-
group=ElectrodeGroup(name="test_group", description="", device=Device(name="test_device"), location="unknown"),
248-
group_name="test_group",
249-
)
242+
if get_package_version(name="pynwb") >= version.Version("2.1.0"):
243+
table.add_row(
244+
location="unknown",
245+
group=ElectrodeGroup(
246+
name="test_group", description="", device=Device(name="test_device"), location="unknown"
247+
),
248+
group_name="test_group",
249+
)
250+
else:
251+
table.add_row(
252+
x=np.nan,
253+
y=np.nan,
254+
z=np.nan,
255+
imp=np.nan,
256+
location="unknown",
257+
filtering="unknown",
258+
group=ElectrodeGroup(
259+
name="test_group", description="", device=Device(name="test_device"), location="unknown"
260+
),
261+
group_name="test_group",
262+
)
250263
assert check_single_row(table=table) is None
251264

252265

0 commit comments

Comments
 (0)