Merge remote-tracking branch 'origin/order_of_images' into order_of_images

bendichter · bendichter · commit 89cb362b000f · 2022-07-11T18:14:46.000-04:00
diff --git a/nwbinspector/checks/images.py b/nwbinspector/checks/images.py
@@ -10,9 +10,7 @@ def check_order_of_images_unique(images: Images):
     if images.order_of_images is None:
         return
     if not len(set(images.order_of_images)) == len(images.order_of_images):
-        return InspectorMessage(
-            message="order_of_images should have unique values."
-        )
+        return InspectorMessage(message="order_of_images should have unique values.")
 
 
 @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=Images)
@@ -22,5 +20,5 @@ def check_order_of_images_len(images: Images):
     if not len(images.order_of_images) == len(images.images):
         return InspectorMessage(
             message=f"Length of order_of_images ({len(images.order_of_images)}) does not match the number of images ("
-                    f"{len(images.images)})."
-        )
+            f"{len(images.images)})."
+        )
diff --git a/nwbinspector/nwbinspector.py b/nwbinspector/nwbinspector.py
@@ -8,11 +8,12 @@
 from pathlib import Path
 from collections.abc import Iterable
 from enum import Enum
-from typing import Optional, List
+from typing import Union, Optional, List
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from types import FunctionType
 from warnings import filterwarnings, warn
 from distutils.util import strtobool
+from time import sleep
 
 import click
 import pynwb
@@ -28,7 +29,7 @@
 )
 from .register_checks import InspectorMessage, Importance
 from .tools import get_s3_urls_and_dandi_paths
-from .utils import FilePathType, PathType, OptionalListOfStrings
+from .utils import FilePathType, PathType, OptionalListOfStrings, robust_s3_read, calculate_number_of_cpu
 
 INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
 
@@ -278,7 +279,7 @@ def inspect_all(
     config: Optional[dict] = None,
     ignore: OptionalListOfStrings = None,
     select: OptionalListOfStrings = None,
-    importance_threshold: Importance = Importance.BEST_PRACTICE_SUGGESTION,
+    importance_threshold: Union[str, Importance] = Importance.BEST_PRACTICE_SUGGESTION,
     n_jobs: int = 1,
     skip_validate: bool = False,
     progress_bar: bool = True,
@@ -305,7 +306,7 @@ def inspect_all(
         Names of functions to skip.
     select: list of strings, optional
         Names of functions to pick out of available checks.
-    importance_threshold : string, optional
+    importance_threshold : string or Importance, optional
         Ignores tests with an assigned importance below this threshold.
         Importance has three levels:
             CRITICAL
@@ -317,6 +318,8 @@ def inspect_all(
         The default is the lowest level, BEST_PRACTICE_SUGGESTION.
     n_jobs : int
         Number of jobs to use in parallel. Set to -1 to use all available resources.
+        This may also be a negative integer x from -2 to -(number_of_cpus - 1) which acts like negative slicing by using
+        all available CPUs minus the absolute value of x (e.g., -2 uses number_of_cpus - 2).
         Set to 1 (also the default) to disable.
     skip_validate : bool, optional
         Skip the PyNWB validation step. This may be desired for older NWBFiles (< schema version v2.10).
@@ -336,7 +339,11 @@ def inspect_all(
         Common options are 'draft' or 'published'.
         Defaults to the most recent published version, or if not published then the most recent draft version.
     """
+    importance_threshold = (
+        Importance[importance_threshold] if isinstance(importance_threshold, str) else importance_threshold
+    )
     modules = modules or []
+    n_jobs = calculate_number_of_cpu(requested_cpu=n_jobs)
     if progress_bar_options is None:
         progress_bar_options = dict(position=0, leave=False)
         if stream:
@@ -410,9 +417,10 @@ def inspect_nwb(
     config: dict = None,
     ignore: OptionalListOfStrings = None,
     select: OptionalListOfStrings = None,
-    importance_threshold: Importance = Importance.BEST_PRACTICE_SUGGESTION,
-    driver: str = None,
+    importance_threshold: Union[str, Importance] = Importance.BEST_PRACTICE_SUGGESTION,
+    driver: Optional[str] = None,
     skip_validate: bool = False,
+    max_retries: int = 10,
 ) -> List[InspectorMessage]:
     """
     Inspect a NWBFile object and return suggestions for improvements according to best practices.
@@ -431,7 +439,7 @@ def inspect_nwb(
         Names of functions to skip.
     select: list, optional
         Names of functions to pick out of available checks.
-    importance_threshold : string, optional
+    importance_threshold : string or Importance, optional
         Ignores tests with an assigned importance below this threshold.
         Importance has three levels:
             CRITICAL
@@ -446,14 +454,23 @@ def inspect_nwb(
     skip_validate : bool
         Skip the PyNWB validation step. This may be desired for older NWBFiles (< schema version v2.10).
         The default is False, which is also recommended.
+    max_retries : int, optional
+        When using the ros3 driver to stream data from an s3 path, occasional curl issues can result.
+        AWS suggests using iterative retry with an exponential backoff of 0.1 * 2^retries.
+        This sets a hard bound on the number of times to attempt to retry the collection of messages.
+        Defaults to 10 (final backoff wait of 51.2s; roughly 102.3s of waiting in total).
     """
+    importance_threshold = (
+        Importance[importance_threshold] if isinstance(importance_threshold, str) else importance_threshold
+    )
     if any(x is not None for x in [config, ignore, select, importance_threshold]):
         checks = configure_checks(
             checks=checks, config=config, ignore=ignore, select=select, importance_threshold=importance_threshold
         )
     nwbfile_path = str(nwbfile_path)
     filterwarnings(action="ignore", message="No cached namespaces found in .*")
     filterwarnings(action="ignore", message="Ignoring cached namespace .*")
+
     with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True, driver=driver) as io:
         if not skip_validate:
             validation_errors = pynwb.validate(io=io)
@@ -467,7 +484,7 @@ def inspect_nwb(
                 )
 
         try:
-            nwbfile = io.read()
+            nwbfile = robust_s3_read(command=io.read, max_retries=max_retries)
             for inspector_message in run_checks(nwbfile=nwbfile, checks=checks):
                 inspector_message.file_path = nwbfile_path
                 yield inspector_message
@@ -493,7 +510,7 @@ def run_checks(nwbfile: pynwb.NWBFile, checks: list):
         for nwbfile_object in nwbfile.objects.values():
             if check_function.neurodata_type is None or issubclass(type(nwbfile_object), check_function.neurodata_type):
                 try:
-                    output = check_function(nwbfile_object)
+                    output = robust_s3_read(command=check_function, command_args=[nwbfile_object])
                 # if an individual check fails, include it in the report and continue with the inspection
                 except Exception:
                     output = InspectorMessage(
diff --git a/nwbinspector/tools.py b/nwbinspector/tools.py
@@ -7,7 +7,7 @@
 
 from pynwb import NWBFile
 
-from .utils import is_module_installed
+from .utils import is_module_installed, calculate_number_of_cpu
 
 
 def make_minimal_nwbfile():
@@ -43,6 +43,7 @@ def get_s3_urls_and_dandi_paths(dandiset_id: str, version_id: Optional[str] = No
     ), "The specified 'path' is not a proper DANDISet ID. It should be a six-digit numeric identifier."
 
     s3_urls_to_dandi_paths = dict()
+    n_jobs = calculate_number_of_cpu(requested_cpu=n_jobs)
     if n_jobs != 1:
         with DandiAPIClient() as client:
             dandiset = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id)
diff --git a/nwbinspector/utils.py b/nwbinspector/utils.py
@@ -1,11 +1,13 @@
 """Commonly reused logic for evaluating conditions; must not have external dependencies."""
+import os
 import re
 import json
 import numpy as np
-from typing import TypeVar, Optional, List
+from typing import TypeVar, Optional, List, Dict, Callable
 from pathlib import Path
 from importlib import import_module
 from packaging import version
+from time import sleep
 
 PathType = TypeVar("PathType", str, Path)  # For types that can be either files or folders
 FilePathType = TypeVar("FilePathType", str, Path)
@@ -113,3 +115,41 @@ def get_package_version(name: str) -> version.Version:
 
         package_version = get_distribution(name).version
     return version.parse(package_version)
+
+
+def robust_s3_read(
+    command: Callable, max_retries: int = 10, command_args: Optional[list] = None, command_kwargs: Optional[Dict] = None
+):
+    """Attempt the command (usually acting on an S3 IO) up to max_retries times, using exponential backoff."""
+    command_args = command_args or []
+    command_kwargs = command_kwargs or dict()
+    for retry in range(max_retries):
+        try:
+            return command(*command_args, **command_kwargs)
+        except OSError:  # cannot curl request
+            sleep(0.1 * 2**retry)
+        except Exception as exc:
+            raise exc
+    raise TimeoutError(f"Unable to complete the command ({command.__name__}) after {max_retries} attempts!")
+
+
+def calculate_number_of_cpu(requested_cpu: int = 1) -> int:
+    """
+    Calculate the number of CPUs to use with respect to negative slicing and check against maximal available resources.
+
+    Parameters
+    ----------
+    requested_cpu : int, optional
+        The desired number of CPUs to use.
+
+        The default is 1.
+    """
+    total_cpu = os.cpu_count()
+    assert requested_cpu <= total_cpu, f"Requested more CPUs ({requested_cpu}) than are available ({total_cpu})!"
+    assert requested_cpu >= -(
+        total_cpu - 1
+    ), f"Requested fewer CPUs ({requested_cpu}) than are available ({total_cpu})!"
+    if requested_cpu > 0:
+        return requested_cpu
+    else:
+        return total_cpu + requested_cpu
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     long_description = f.read()
 setup(
     name="nwbinspector",
-    version="0.4.6",
+    version="0.4.7",
     description="Tool to inspect NWB files for best practices compliance.",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/tests/test_inspector.py b/tests/test_inspector.py
@@ -330,7 +330,7 @@ def test_inspect_nwb(self):
         ]
         self.assertCountEqual(first=test_results, second=true_results)
 
-    def test_inspect_nwb_importance_threshold(self):
+    def test_inspect_nwb_importance_threshold_as_importance(self):
         test_results = list(
             inspect_nwb(
                 nwbfile_path=self.nwbfile_paths[0], checks=self.checks, importance_threshold=Importance.CRITICAL
@@ -361,6 +361,35 @@ def test_inspect_nwb_importance_threshold(self):
         ]
         self.assertCountEqual(first=test_results, second=true_results)
 
+    def test_inspect_nwb_importance_threshold_as_string(self):
+        test_results = list(
+            inspect_nwb(nwbfile_path=self.nwbfile_paths[0], checks=self.checks, importance_threshold="CRITICAL")
+        )
+        true_results = [
+            InspectorMessage(
+                message=(
+                    "Data may be in the wrong orientation. Time should be in the first dimension, and is "
+                    "usually the longest dimension. Here, another dimension is longer."
+                ),
+                importance=Importance.CRITICAL,
+                check_function_name="check_data_orientation",
+                object_type="SpatialSeries",
+                object_name="my_spatial_series",
+                location="/processing/behavior/Position/my_spatial_series",
+                file_path=self.nwbfile_paths[0],
+            ),
+            InspectorMessage(
+                message="The length of the first dimension of data does not match the length of timestamps.",
+                importance=Importance.CRITICAL,
+                check_function_name="check_timestamps_match_first_dimension",
+                object_type="TimeSeries",
+                object_name="test_time_series_3",
+                location="/acquisition/test_time_series_3",
+                file_path=self.nwbfile_paths[0],
+            ),
+        ]
+        self.assertCountEqual(first=test_results, second=true_results)
+
     def test_command_line_runs_cli_only(self):
         console_output_file = self.tempdir / "test_console_output.txt"
         os.system(
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,9 +1,16 @@
+import os
 from packaging import version
 
 from hdmf.testing import TestCase
 
 from nwbinspector import Importance
-from nwbinspector.utils import format_byte_size, check_regular_series, is_dict_in_string, get_package_version
+from nwbinspector.utils import (
+    format_byte_size,
+    check_regular_series,
+    is_dict_in_string,
+    get_package_version,
+    calculate_number_of_cpu,
+)
 
 
 def test_format_byte_size():
@@ -104,3 +111,27 @@ def test_get_package_version_type():
 
 def test_get_package_version_value():
     assert get_package_version("hdmf") >= version.parse("3.1.1")  # minimum supported PyNWB version
+
+
+class TestCalulcateNumberOfCPU(TestCase):
+    total_cpu = os.cpu_count()
+
+    def test_request_more_than_available_assert(self):
+        requested_cpu = 2500
+        with self.assertRaisesWith(
+            exc_type=AssertionError,
+            exc_msg=f"Requested more CPUs ({requested_cpu}) than are available ({self.total_cpu})!",
+        ):
+            calculate_number_of_cpu(requested_cpu=requested_cpu)
+
+    def test_request_fewer_than_available_assert(self):
+        requested_cpu = -2500
+        with self.assertRaisesWith(
+            exc_type=AssertionError,
+            exc_msg=f"Requested fewer CPUs ({requested_cpu}) than are available ({self.total_cpu})!",
+        ):
+            calculate_number_of_cpu(requested_cpu=requested_cpu)
+
+    def test_calculate_number_of_cpu_negative_value(self):
+        requested_cpu = -1  # CI only has 2 jobs available
+        assert calculate_number_of_cpu(requested_cpu=requested_cpu) == requested_cpu % self.total_cpu
diff --git a/tests/unit_tests/test_images.py b/tests/unit_tests/test_images.py
@@ -55,6 +55,3 @@ def test_pass_check_order_of_images_len():
     images = Images(name="my_images", images=imgs, order_of_images=img_refs)
 
     assert check_order_of_images_len(images) is None
-
-
-
diff --git a/tests/unit_tests/test_tables.py b/tests/unit_tests/test_tables.py
@@ -1,6 +1,7 @@
 import platform
 import json
 from unittest import TestCase
+from packaging import version
 
 import numpy as np
 from hdmf.common import DynamicTable, DynamicTableRegion
@@ -17,6 +18,7 @@
     check_single_row,
     check_table_values_for_dict,
 )
+from nwbinspector.utils import get_package_version
 
 
 class TestCheckDynamicTableRegion(TestCase):
@@ -237,16 +239,27 @@ def test_check_single_row_ignore_electrodes():
     table = ElectrodeTable(
         name="electrodes",  # default name when building through nwbfile
     )
-    table.add_row(
-        x=np.nan,
-        y=np.nan,
-        z=np.nan,
-        imp=np.nan,
-        location="unknown",
-        filtering="unknown",
-        group=ElectrodeGroup(name="test_group", description="", device=Device(name="test_device"), location="unknown"),
-        group_name="test_group",
-    )
+    if get_package_version(name="pynwb") >= version.Version("2.1.0"):
+        table.add_row(
+            location="unknown",
+            group=ElectrodeGroup(
+                name="test_group", description="", device=Device(name="test_device"), location="unknown"
+            ),
+            group_name="test_group",
+        )
+    else:
+        table.add_row(
+            x=np.nan,
+            y=np.nan,
+            z=np.nan,
+            imp=np.nan,
+            location="unknown",
+            filtering="unknown",
+            group=ElectrodeGroup(
+                name="test_group", description="", device=Device(name="test_device"), location="unknown"
+            ),
+            group_name="test_group",
+        )
     assert check_single_row(table=table) is None
 
 
diff --git a/tests/unit_tests/test_time_series.py b/tests/unit_tests/test_time_series.py