
Commit 2ee62e2

Authored by CodyCBakerPhD (Cody Baker), with pre-commit-ci[bot] and bendichter
Add caching for repeated table data calls (#230)
* added utility for easy data caching
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* debugged
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* more general debug
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* swap to using selections and h5py Datasets
* Update nwbinspector/utils.py (Co-authored-by: Ben Dichter <[email protected]>)
* support tuple caching
* support array slicing for cached data selection
* fixed hashing issues; seeing if table message is still changed
* fix table test
* fixed tests?

Co-authored-by: Cody Baker <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ben Dichter <[email protected]>
1 parent cef1bdb commit 2ee62e2

4 files changed: +55 −17 lines

nwbinspector/checks/tables.py

Lines changed: 9 additions & 4 deletions
@@ -9,6 +9,7 @@
 from ..register_checks import register_check, InspectorMessage, Importance
 from ..utils import (
+    _cache_data_selection,
     format_byte_size,
     is_ascending_series,
     is_dict_in_string,
@@ -55,7 +56,7 @@ def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: int
     unsorted_cols = []
     for column in time_intervals.columns:
         if column.name[-5:] == "_time":
-            if not is_ascending_series(column, nelems):
+            if not is_ascending_series(column.data, nelems):
                 unsorted_cols.append(column.name)
     if unsorted_cols:
         return InspectorMessage(
@@ -79,7 +80,11 @@ def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems:
     very long so you don't need to load the entire array into memory. Use None to
     load the entire arrays.
     """
-    if np.any(np.asarray(time_intervals["stop_time"][:nelems]) - np.asarray(time_intervals["start_time"][:nelems]) < 0):
+    if np.any(
+        np.asarray(_cache_data_selection(data=time_intervals["stop_time"].data, selection=slice(nelems)))
+        - np.asarray(_cache_data_selection(data=time_intervals["start_time"].data, selection=slice(nelems)))
+        < 0
+    ):
         return InspectorMessage(
             message=(
                 "stop_times should be greater than start_times. Make sure the stop times are with respect to the "
@@ -106,7 +111,7 @@ def check_column_binary_capability(table: DynamicTable, nelems: int = 200):
         if np.asarray(column.data[0]).itemsize == 1:
             continue  # already boolean, int8, or uint8
         try:
-            unique_values = np.unique(column.data[:nelems])
+            unique_values = np.unique(_cache_data_selection(data=column.data, selection=slice(nelems)))
         except TypeError:  # some contained objects are unhashable or have no comparison defined
             continue
         if unique_values.size != 2:
@@ -174,7 +179,7 @@ def check_table_values_for_dict(table: DynamicTable, nelems: int = 200):
     for column in table.columns:
         if not hasattr(column, "data") or isinstance(column, VectorIndex) or not isinstance(column.data[0], str):
             continue
-        for string in column.data[:nelems]:
+        for string in _cache_data_selection(data=column.data, selection=slice(nelems)):
             if is_dict_in_string(string=string):
                 message = (
                     f"The column '{column.name}' contains a string value that contains a dictionary! Please "

nwbinspector/utils.py

Lines changed: 37 additions & 4 deletions
@@ -2,18 +2,48 @@
 import os
 import re
 import json
-import numpy as np
-from typing import TypeVar, Optional, List, Dict, Callable
+from typing import TypeVar, Union, Optional, List, Dict, Callable, Tuple
 from pathlib import Path
 from importlib import import_module
 from packaging import version
 from time import sleep
+from functools import lru_cache
+
+import h5py
+import numpy as np
+from numpy.typing import ArrayLike
+

 PathType = TypeVar("PathType", str, Path)  # For types that can be either files or folders
 FilePathType = TypeVar("FilePathType", str, Path)
 OptionalListOfStrings = Optional[List[str]]

 dict_regex = r"({.+:.+})"
+MAX_CACHE_ITEMS = 1000  # lru_cache default is 128 calls of matching input/output, but might need more to get use here
+
+
+@lru_cache(maxsize=MAX_CACHE_ITEMS)
+def _cache_data_retrieval_command(
+    data: h5py.Dataset, reduced_selection: Tuple[Tuple[Optional[int], Optional[int], Optional[int]]]
+) -> np.ndarray:
+    """LRU caching for _cache_data_selection cannot be applied to list inputs; this expects the tuple or Dataset."""
+    selection = tuple([slice(*reduced_slice) for reduced_slice in reduced_selection])  # reconstitute the slices
+    return data[selection]
+
+
+def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union[slice, Tuple[slice]]) -> np.ndarray:
+    """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming)."""
+    if isinstance(data, np.memmap):  # Technically np.memmap should be able to support this type of behavior as well
+        return data[selection]  # But they aren't natively hashable either...
+    if not isinstance(data, h5py.Dataset):  # No need to attempt to cache if already an in-memory object
+        return np.array(data)[selection]
+
+    # slices also aren't hashable, but their reduced representation is
+    if isinstance(selection, slice):  # If a single slice
+        reduced_selection = tuple([selection.__reduce__()[1]])
+    else:
+        reduced_selection = tuple([selection_slice.__reduce__()[1] for selection_slice in selection])
+    return _cache_data_retrieval_command(data=data, reduced_selection=reduced_selection)
@@ -52,9 +82,12 @@ def check_regular_series(series: np.ndarray, tolerance_decimals: int = 9):
     return len(uniq_diff_ts) == 1


-def is_ascending_series(series: np.ndarray, nelems=None):
+def is_ascending_series(series: Union[h5py.Dataset, ArrayLike], nelems=None):
     """General purpose function for determining if a series is monotonic increasing."""
-    return np.all(np.diff(series[:nelems]) > 0)
+    if isinstance(series, h5py.Dataset):
+        return np.all(np.diff(_cache_data_selection(data=series, selection=slice(nelems))) > 0)
+    else:
+        return np.all(np.diff(series[:nelems]) > 0)  # already in memory, no need to cache


 def is_dict_in_string(string: str):
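
A note on the hashing trick above: plain slices are not hashable (at least prior to Python 3.12), so they cannot serve directly as lru_cache keys, but slice.__reduce__() exposes a hashable (start, stop, step) tuple that round-trips back into an identical slice. Below is a minimal sketch of that mechanism plus a hypothetical usage example; the file name "example.nwb" and the dataset path are stand-ins for illustration, not taken from this commit:

import h5py
import numpy as np
from nwbinspector.utils import _cache_data_selection

# slice.__reduce__() yields (slice, (start, stop, step)); the args tuple is hashable
reduced = slice(200).__reduce__()[1]
assert reduced == (None, 200, None)
assert slice(*reduced) == slice(200)  # reconstitutes the original slice

# Hypothetical usage: repeated identical selections are served from the LRU cache
with h5py.File("example.nwb", "r") as file:  # "example.nwb" is a stand-in path
    dset = file["intervals/trials/start_time"]  # hypothetical dataset path
    first = _cache_data_selection(data=dset, selection=slice(200))   # reads from disk
    second = _cache_data_selection(data=dset, selection=slice(200))  # cache hit
    assert np.array_equal(first, second)

The payoff is largest when the same table column is sliced by several checks in a row, especially when the file is streamed rather than read from local disk.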

tests/unit_tests/test_tables.py

Lines changed: 2 additions & 2 deletions
@@ -183,8 +183,8 @@ def test_binary_int_fail(self):
         assert check_column_binary_capability(table=self.table) == [
             InspectorMessage(
                 message=(
-                    "Column 'test_col' uses 'integers' but has binary values [0 1]. Consider making it boolean instead "
-                    f"and renaming the column to start with 'is_'; doing so will save {platform_saved_bytes}."
+                    "Column 'test_col' uses 'integers' but has binary values [0 1]. Consider making it boolean "
+                    f"instead and renaming the column to start with 'is_'; doing so will save {platform_saved_bytes}."
                 ),
                 importance=Importance.BEST_PRACTICE_SUGGESTION,
                 check_function_name="check_column_binary_capability",

tests/unit_tests/test_time_series.py

Lines changed: 7 additions & 7 deletions
@@ -133,7 +133,12 @@ def test_check_timestamps_empty_timestamps():
     )


-def test_check_timestamps_ascending():
+def test_pass_check_timestamps_ascending_pass():
+    time_series = pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[1, 2, 3], timestamps=[1, 2, 3])
+    assert check_timestamps_ascending(time_series) is None
+
+
+def test_check_timestamps_ascending_fail():
     time_series = pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[1, 2, 3], timestamps=[1, 3, 2])
     assert check_timestamps_ascending(time_series) == InspectorMessage(
         message="test_time_series timestamps are not ascending.",
@@ -145,11 +150,6 @@ def test_check_timestamps_ascending():
     )


-def test_pass_check_timestamps_ascending():
-    time_series = pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[1, 2, 3], timestamps=[1, 2, 3])
-    assert check_timestamps_ascending(time_series) is None
-
-
 def test_check_missing_unit_pass():
     time_series = pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[1, 2, 3], timestamps=[1, 2, 3])
     assert check_missing_unit(time_series) is None
@@ -169,7 +169,7 @@ def test_check_missing_unit_fail():

 def test_check_positive_resolution_pass():
     time_series = pynwb.TimeSeries(name="test", unit="test_units", data=[1, 2, 3], timestamps=[1, 2, 3], resolution=3.4)
-    assert check_timestamps_ascending(time_series) is None
+    assert check_resolution(time_series) is None
