Skip to content

Commit a4b157f

Browse files
Merge pull request #570 from catalystneuro/new_backend_add_backend_configuration_tool
[Backend Configuration IIb] Add backend collection tools
2 parents f44c06a + df5a1a4 commit a4b157f

File tree

9 files changed

+297
-15
lines changed

9 files changed

+297
-15
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
The name of the plane segmentation is used to determine which traces to add to the `Fluorescence` and `DfOverF` containers. [PR #632](https://github.com/catalystneuro/neuroconv/pull/632)
77
* Modify the filtering of traces to also filter out traces with empty values. [PR #649](https://github.com/catalystneuro/neuroconv/pull/649)
88
* Added tool function `get_default_dataset_configurations` for identifying and collecting all fields of an in-memory `NWBFile` that could become datasets on disk; and return instances of the Pydantic dataset models filled with default values for chunking/buffering/compression. [PR #569](https://github.com/catalystneuro/neuroconv/pull/569)
9+
* Added tool function `get_default_backend_configuration` for conveniently packaging the results of `get_default_dataset_configurations` into an easy-to-modify mapping from locations of objects within the file to their corresponding dataset configuration options, as well as linking to a specific backend DataIO. [PR #570](https://github.com/catalystneuro/neuroconv/pull/570)
910

1011
### Fixes
1112
* Fixed GenericDataChunkIterator (in hdmf.py) in the case where the number of dimensions is 1 and the size in bytes is greater than the threshold of 1 GB. [PR #638](https://github.com/catalystneuro/neuroconv/pull/638)

src/neuroconv/tools/nwb_helpers/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
from ._dataset_configuration import get_default_dataset_io_configurations
1+
"""Collection of Pydantic models and helper functions for configuring dataset IO parameters for different backends."""
2+
from ._dataset_configuration import (
3+
get_default_backend_configuration,
4+
get_default_dataset_io_configurations,
5+
)
26
from ._metadata_and_file_helpers import (
37
add_device_from_metadata,
48
get_default_nwbfile_metadata,

src/neuroconv/tools/nwb_helpers/_dataset_configuration.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ def get_default_dataset_io_configurations(
103103
dynamic_table = neurodata_object # for readability
104104

105105
for column in dynamic_table.columns:
106-
column_name = column.name
107106
candidate_dataset = column.data # VectorData object
108107
if _is_dataset_written_to_file(
109108
candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file
@@ -147,3 +146,18 @@ def get_default_dataset_io_configurations(
147146
)
148147

149148
yield dataset_io_configuration
149+
150+
151+
def get_default_backend_configuration(
152+
nwbfile: NWBFile, backend: Literal["hdf5", "zarr"]
153+
) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
154+
"""Fill a default backend configuration to serve as a starting point for further customization."""
155+
BackendConfigurationClass = BACKEND_TO_CONFIGURATION[backend]
156+
default_dataset_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)
157+
dataset_configurations = {
158+
default_dataset_configuration.dataset_info.location: default_dataset_configuration
159+
for default_dataset_configuration in default_dataset_configurations
160+
}
161+
162+
backend_configuration = BackendConfigurationClass(dataset_configurations=dataset_configurations)
163+
return backend_configuration

src/neuroconv/tools/nwb_helpers/_models/_base_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def __str__(self) -> str:
171171
# TODO: add nicer auto-selection/rendering of units and amount for source data size
172172
"\n"
173173
f"\n buffer shape : {self.buffer_shape}"
174-
f"\n maximum RAM usage per iteration : {maximum_ram_usage_per_iteration_in_gb:0.2f} GB"
174+
f"\n expected RAM usage : {maximum_ram_usage_per_iteration_in_gb:0.2f} GB"
175175
"\n"
176176
f"\n chunk shape : {self.chunk_shape}"
177177
f"\n disk space usage per chunk : {disk_space_usage_per_chunk_in_mb:0.2f} MB"
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
"""Integration tests for `get_default_backend_configuration`."""
2+
from io import StringIO
3+
from pathlib import Path
4+
from unittest.mock import patch
5+
6+
import numpy as np
7+
import pytest
8+
from hdmf_zarr import NWBZarrIO
9+
from pynwb import NWBHDF5IO, NWBFile
10+
from pynwb.testing.mock.base import mock_TimeSeries
11+
from pynwb.testing.mock.file import mock_NWBFile
12+
13+
from neuroconv.tools.nwb_helpers import (
14+
HDF5BackendConfiguration,
15+
ZarrBackendConfiguration,
16+
get_default_backend_configuration,
17+
get_module,
18+
)
19+
20+
21+
def generate_complex_nwbfile() -> NWBFile:
22+
nwbfile = mock_NWBFile()
23+
24+
raw_array = np.array([[1, 2, 3], [4, 5, 6]])
25+
raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array)
26+
nwbfile.add_acquisition(raw_time_series)
27+
28+
number_of_trials = 10
29+
for start_time, stop_time in zip(
30+
np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials)
31+
):
32+
nwbfile.add_trial(start_time=start_time, stop_time=stop_time)
33+
34+
ecephys_module = get_module(nwbfile=nwbfile, name="ecephys")
35+
processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]])
36+
processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array)
37+
ecephys_module.add(processed_time_series)
38+
39+
return nwbfile
40+
41+
42+
@pytest.fixture(scope="session")
43+
def hdf5_nwbfile_path(tmpdir_factory):
44+
nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5")
45+
if not Path(nwbfile_path).exists():
46+
nwbfile = generate_complex_nwbfile()
47+
with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io:
48+
io.write(nwbfile)
49+
return str(nwbfile_path)
50+
51+
52+
@pytest.fixture(scope="session")
53+
def zarr_nwbfile_path(tmpdir_factory):
54+
nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_zarr_nwbfile.nwb.zarr")
55+
if not Path(nwbfile_path).exists():
56+
nwbfile = generate_complex_nwbfile()
57+
with NWBZarrIO(path=str(nwbfile_path), mode="w") as io:
58+
io.write(nwbfile)
59+
return str(nwbfile_path)
60+
61+
62+
def test_complex_hdf5(hdf5_nwbfile_path):
63+
with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io:
64+
nwbfile = io.read()
65+
66+
raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32")
67+
raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array)
68+
nwbfile.add_acquisition(raw_time_series)
69+
70+
number_of_epochs = 5
71+
for start_time, stop_time in zip(
72+
np.linspace(start=0.0, stop=10.0, num=number_of_epochs),
73+
np.linspace(start=1.0, stop=11.0, num=number_of_epochs),
74+
):
75+
nwbfile.add_epoch(start_time=start_time, stop_time=stop_time)
76+
77+
ecephys_module = get_module(nwbfile=nwbfile, name="ecephys")
78+
processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]])
79+
processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array)
80+
ecephys_module.add(processed_time_series)
81+
82+
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5")
83+
84+
assert isinstance(backend_configuration, HDF5BackendConfiguration)
85+
86+
dataset_configurations = backend_configuration.dataset_configurations
87+
assert len(dataset_configurations) == 4
88+
assert "acquisition/NewRawTimeSeries/data" in dataset_configurations
89+
assert "epochs/start_time/data" in dataset_configurations
90+
assert "epochs/stop_time/data" in dataset_configurations
91+
assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations
92+
93+
# Best summary test of expected output is the printout
94+
with patch("sys.stdout", new=StringIO()) as stdout:
95+
print(backend_configuration)
96+
97+
expected_print = """
98+
Configurable datasets identified using the hdf5 backend
99+
-------------------------------------------------------
100+
101+
epochs/start_time/data
102+
----------------------
103+
dtype : float64
104+
full shape of source array : (5,)
105+
full size of source array : 0.00 GB
106+
107+
buffer shape : (5,)
108+
expected RAM usage : 0.00 GB
109+
110+
chunk shape : (5,)
111+
disk space usage per chunk : 0.00 MB
112+
113+
compression method : gzip
114+
115+
116+
epochs/stop_time/data
117+
---------------------
118+
dtype : float64
119+
full shape of source array : (5,)
120+
full size of source array : 0.00 GB
121+
122+
buffer shape : (5,)
123+
expected RAM usage : 0.00 GB
124+
125+
chunk shape : (5,)
126+
disk space usage per chunk : 0.00 MB
127+
128+
compression method : gzip
129+
130+
131+
acquisition/NewRawTimeSeries/data
132+
---------------------------------
133+
dtype : int32
134+
full shape of source array : (2, 3)
135+
full size of source array : 0.00 GB
136+
137+
buffer shape : (2, 3)
138+
expected RAM usage : 0.00 GB
139+
140+
chunk shape : (2, 3)
141+
disk space usage per chunk : 0.00 MB
142+
143+
compression method : gzip
144+
145+
146+
processing/ecephys/NewProcessedTimeSeries/data
147+
----------------------------------------------
148+
dtype : float64
149+
full shape of source array : (4, 2)
150+
full size of source array : 0.00 GB
151+
152+
buffer shape : (4, 2)
153+
expected RAM usage : 0.00 GB
154+
155+
chunk shape : (4, 2)
156+
disk space usage per chunk : 0.00 MB
157+
158+
compression method : gzip
159+
160+
"""
161+
assert stdout.getvalue() == expected_print
162+
163+
164+
def test_complex_zarr(zarr_nwbfile_path):
165+
with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io:
166+
nwbfile = io.read()
167+
168+
raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32")
169+
raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array)
170+
nwbfile.add_acquisition(raw_time_series)
171+
172+
number_of_epochs = 5
173+
for start_time, stop_time in zip(
174+
np.linspace(start=0.0, stop=10.0, num=number_of_epochs),
175+
np.linspace(start=1.0, stop=11.0, num=number_of_epochs),
176+
):
177+
nwbfile.add_epoch(start_time=start_time, stop_time=stop_time)
178+
179+
ecephys_module = get_module(nwbfile=nwbfile, name="ecephys")
180+
processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]])
181+
processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array)
182+
ecephys_module.add(processed_time_series)
183+
184+
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="zarr")
185+
186+
assert isinstance(backend_configuration, ZarrBackendConfiguration)
187+
188+
dataset_configurations = backend_configuration.dataset_configurations
189+
assert len(dataset_configurations) == 4
190+
assert "acquisition/NewRawTimeSeries/data" in dataset_configurations
191+
assert "epochs/start_time/data" in dataset_configurations
192+
assert "epochs/stop_time/data" in dataset_configurations
193+
assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations
194+
195+
# Best summary test of expected output is the printout
196+
with patch("sys.stdout", new=StringIO()) as stdout:
197+
print(backend_configuration)
198+
199+
expected_print = """
200+
Configurable datasets identified using the zarr backend
201+
-------------------------------------------------------
202+
203+
epochs/start_time/data
204+
----------------------
205+
dtype : float64
206+
full shape of source array : (5,)
207+
full size of source array : 0.00 GB
208+
209+
buffer shape : (5,)
210+
expected RAM usage : 0.00 GB
211+
212+
chunk shape : (5,)
213+
disk space usage per chunk : 0.00 MB
214+
215+
compression method : gzip
216+
217+
218+
epochs/stop_time/data
219+
---------------------
220+
dtype : float64
221+
full shape of source array : (5,)
222+
full size of source array : 0.00 GB
223+
224+
buffer shape : (5,)
225+
expected RAM usage : 0.00 GB
226+
227+
chunk shape : (5,)
228+
disk space usage per chunk : 0.00 MB
229+
230+
compression method : gzip
231+
232+
233+
acquisition/NewRawTimeSeries/data
234+
---------------------------------
235+
dtype : int32
236+
full shape of source array : (2, 3)
237+
full size of source array : 0.00 GB
238+
239+
buffer shape : (2, 3)
240+
expected RAM usage : 0.00 GB
241+
242+
chunk shape : (2, 3)
243+
disk space usage per chunk : 0.00 MB
244+
245+
compression method : gzip
246+
247+
248+
processing/ecephys/NewProcessedTimeSeries/data
249+
----------------------------------------------
250+
dtype : float64
251+
full shape of source array : (4, 2)
252+
full size of source array : 0.00 GB
253+
254+
buffer shape : (4, 2)
255+
expected RAM usage : 0.00 GB
256+
257+
chunk shape : (4, 2)
258+
disk space usage per chunk : 0.00 MB
259+
260+
compression method : gzip
261+
262+
"""
263+
assert stdout.getvalue() == expected_print

tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_hdf5_backend_configuration_print():
2323
full size of source array : 1.38 GB
2424
2525
buffer shape : (1250000, 384)
26-
maximum RAM usage per iteration : 0.96 GB
26+
expected RAM usage : 0.96 GB
2727
2828
chunk shape : (78125, 64)
2929
disk space usage per chunk : 10.00 MB
@@ -38,7 +38,7 @@ def test_hdf5_backend_configuration_print():
3838
full size of source array : 0.06 GB
3939
4040
buffer shape : (75000, 384)
41-
maximum RAM usage per iteration : 0.06 GB
41+
expected RAM usage : 0.06 GB
4242
4343
chunk shape : (37500, 128)
4444
disk space usage per chunk : 9.60 MB

tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_hdf5_dataset_configuration_print():
2121
full size of source array : 1.38 GB
2222
2323
buffer shape : (1250000, 384)
24-
maximum RAM usage per iteration : 0.96 GB
24+
expected RAM usage : 0.96 GB
2525
2626
chunk shape : (78125, 64)
2727
disk space usage per chunk : 10.00 MB
@@ -47,7 +47,7 @@ def test_hdf5_dataset_configuration_print_with_compression_options():
4747
full size of source array : 1.38 GB
4848
4949
buffer shape : (1250000, 384)
50-
maximum RAM usage per iteration : 0.96 GB
50+
expected RAM usage : 0.96 GB
5151
5252
chunk shape : (78125, 64)
5353
disk space usage per chunk : 10.00 MB
@@ -74,7 +74,7 @@ def test_hdf5_dataset_configuration_print_with_compression_disabled():
7474
full size of source array : 1.38 GB
7575
7676
buffer shape : (1250000, 384)
77-
maximum RAM usage per iteration : 0.96 GB
77+
expected RAM usage : 0.96 GB
7878
7979
chunk shape : (78125, 64)
8080
disk space usage per chunk : 10.00 MB

tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_zarr_backend_configuration_print():
2323
full size of source array : 1.38 GB
2424
2525
buffer shape : (1250000, 384)
26-
maximum RAM usage per iteration : 0.96 GB
26+
expected RAM usage : 0.96 GB
2727
2828
chunk shape : (78125, 64)
2929
disk space usage per chunk : 10.00 MB
@@ -40,7 +40,7 @@ def test_zarr_backend_configuration_print():
4040
full size of source array : 0.06 GB
4141
4242
buffer shape : (75000, 384)
43-
maximum RAM usage per iteration : 0.06 GB
43+
expected RAM usage : 0.06 GB
4444
4545
chunk shape : (37500, 128)
4646
disk space usage per chunk : 9.60 MB

0 commit comments

Comments
 (0)