Skip to content

Commit a4b157f

Browse files
Merge pull request #570 from catalystneuro/new_backend_add_backend_configuration_tool
[Backend Configuration IIb] Add backend collection tools
2 parents f44c06a + df5a1a4 commit a4b157f

File tree

9 files changed

+297
-15
lines changed

9 files changed

+297
-15
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
The name of the plane segmentation is used to determine which traces to add to the `Fluorescence` and `DfOverF` containers. [PR #632](https://github.com/catalystneuro/neuroconv/pull/632)
77
* Modify the filtering of traces to also filter out traces with empty values. [PR #649](https://github.com/catalystneuro/neuroconv/pull/649)
88
* Added tool function `get_default_dataset_configurations` for identifying and collecting all fields of an in-memory `NWBFile` that could become datasets on disk; and return instances of the Pydantic dataset models filled with default values for chunking/buffering/compression. [PR #569](https://github.com/catalystneuro/neuroconv/pull/569)
9+
* Added tool function `get_default_backend_configuration` for conveniently packaging the results of `get_default_dataset_configurations` into an easy-to-modify mapping from locations of objects within the file to their corresponding dataset configuration options, as well as linking to a specific backend DataIO. [PR #570](https://github.com/catalystneuro/neuroconv/pull/570)
910

1011
### Fixes
1112
* Fixed GenericDataChunkIterator (in hdmf.py) in the case where the number of dimensions is 1 and the size in bytes is greater than the threshold of 1 GB. [PR #638](https://github.com/catalystneuro/neuroconv/pull/638)

src/neuroconv/tools/nwb_helpers/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
from ._dataset_configuration import get_default_dataset_io_configurations
1+
"""Collection of Pydantic models and helper functions for configuring dataset IO parameters for different backends."""
2+
from ._dataset_configuration import (
3+
get_default_backend_configuration,
4+
get_default_dataset_io_configurations,
5+
)
26
from ._metadata_and_file_helpers import (
37
add_device_from_metadata,
48
get_default_nwbfile_metadata,

src/neuroconv/tools/nwb_helpers/_dataset_configuration.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ def get_default_dataset_io_configurations(
103103
dynamic_table = neurodata_object # for readability
104104

105105
for column in dynamic_table.columns:
106-
column_name = column.name
107106
candidate_dataset = column.data # VectorData object
108107
if _is_dataset_written_to_file(
109108
candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file
@@ -147,3 +146,18 @@ def get_default_dataset_io_configurations(
147146
)
148147

149148
yield dataset_io_configuration
149+
150+
151+
def get_default_backend_configuration(
152+
nwbfile: NWBFile, backend: Literal["hdf5", "zarr"]
153+
) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
154+
"""Fill a default backend configuration to serve as a starting point for further customization."""
155+
BackendConfigurationClass = BACKEND_TO_CONFIGURATION[backend]
156+
default_dataset_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)
157+
dataset_configurations = {
158+
default_dataset_configuration.dataset_info.location: default_dataset_configuration
159+
for default_dataset_configuration in default_dataset_configurations
160+
}
161+
162+
backend_configuration = BackendConfigurationClass(dataset_configurations=dataset_configurations)
163+
return backend_configuration

src/neuroconv/tools/nwb_helpers/_models/_base_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def __str__(self) -> str:
171171
# TODO: add nicer auto-selection/rendering of units and amount for source data size
172172
"\n"
173173
f"\n buffer shape : {self.buffer_shape}"
174-
f"\n maximum RAM usage per iteration : {maximum_ram_usage_per_iteration_in_gb:0.2f} GB"
174+
f"\n expected RAM usage : {maximum_ram_usage_per_iteration_in_gb:0.2f} GB"
175175
"\n"
176176
f"\n chunk shape : {self.chunk_shape}"
177177
f"\n disk space usage per chunk : {disk_space_usage_per_chunk_in_mb:0.2f} MB"
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
"""Integration tests for `get_default_backend_configuration`."""
2+
from io import StringIO
3+
from pathlib import Path
4+
from unittest.mock import patch
5+
6+
import numpy as np
7+
import pytest
8+
from hdmf_zarr import NWBZarrIO
9+
from pynwb import NWBHDF5IO, NWBFile
10+
from pynwb.testing.mock.base import mock_TimeSeries
11+
from pynwb.testing.mock.file import mock_NWBFile
12+
13+
from neuroconv.tools.nwb_helpers import (
14+
HDF5BackendConfiguration,
15+
ZarrBackendConfiguration,
16+
get_default_backend_configuration,
17+
get_module,
18+
)
19+
20+
21+
def generate_complex_nwbfile() -> NWBFile:
22+
nwbfile = mock_NWBFile()
23+
24+
raw_array = np.array([[1, 2, 3], [4, 5, 6]])
25+
raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array)
26+
nwbfile.add_acquisition(raw_time_series)
27+
28+
number_of_trials = 10
29+
for start_time, stop_time in zip(
30+
np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials)
31+
):
32+
nwbfile.add_trial(start_time=start_time, stop_time=stop_time)
33+
34+
ecephys_module = get_module(nwbfile=nwbfile, name="ecephys")
35+
processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]])
36+
processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array)
37+
ecephys_module.add(processed_time_series)
38+
39+
return nwbfile
40+
41+
42+
@pytest.fixture(scope="session")
43+
def hdf5_nwbfile_path(tmpdir_factory):
44+
nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5")
45+
if not Path(nwbfile_path).exists():
46+
nwbfile = generate_complex_nwbfile()
47+
with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io:
48+
io.write(nwbfile)
49+
return str(nwbfile_path)
50+
51+
52+
@pytest.fixture(scope="session")
53+
def zarr_nwbfile_path(tmpdir_factory):
54+
nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_zarr_nwbfile.nwb.zarr")
55+
if not Path(nwbfile_path).exists():
56+
nwbfile = generate_complex_nwbfile()
57+
with NWBZarrIO(path=str(nwbfile_path), mode="w") as io:
58+
io.write(nwbfile)
59+
return str(nwbfile_path)
60+
61+
62+
def test_complex_hdf5(hdf5_nwbfile_path):
63+
with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io:
64+
nwbfile = io.read()
65+
66+
raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32")
67+
raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array)
68+
nwbfile.add_acquisition(raw_time_series)
69+
70+
number_of_epochs = 5
71+
for start_time, stop_time in zip(
72+
np.linspace(start=0.0, stop=10.0, num=number_of_epochs),
73+
np.linspace(start=1.0, stop=11.0, num=number_of_epochs),
74+
):
75+
nwbfile.add_epoch(start_time=start_time, stop_time=stop_time)
76+
77+
ecephys_module = get_module(nwbfile=nwbfile, name="ecephys")
78+
processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]])
79+
processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array)
80+
ecephys_module.add(processed_time_series)
81+
82+
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5")
83+
84+
assert isinstance(backend_configuration, HDF5BackendConfiguration)
85+
86+
dataset_configurations = backend_configuration.dataset_configurations
87+
assert len(dataset_configurations) == 4
88+
assert "acquisition/NewRawTimeSeries/data" in dataset_configurations
89+
assert "epochs/start_time/data" in dataset_configurations
90+
assert "epochs/stop_time/data" in dataset_configurations
91+
assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations
92+
93+
# Best summary test of expected output is the printout
94+
with patch("sys.stdout", new=StringIO()) as stdout:
95+
print(backend_configuration)
96+
97+
expected_print = """
98+
Configurable datasets identified using the hdf5 backend
99+
-------------------------------------------------------
100+
101+
epochs/start_time/data
102+
----------------------
103+
dtype : float64
104+
full shape of source array : (5,)
105+
full size of source array : 0.00 GB
106+
107+
buffer shape : (5,)
108+
expected RAM usage : 0.00 GB
109+
110+
chunk shape : (5,)
111+
disk space usage per chunk : 0.00 MB
112+
113+
compression method : gzip
114+
115+
116+
epochs/stop_time/data
117+
---------------------
118+
dtype : float64
119+
full shape of source array : (5,)
120+
full size of source array : 0.00 GB
121+
122+
buffer shape : (5,)
123+
expected RAM usage : 0.00 GB
124+
125+
chunk shape : (5,)
126+
disk space usage per chunk : 0.00 MB
127+
128+
compression method : gzip
129+
130+
131+
acquisition/NewRawTimeSeries/data
132+
---------------------------------
133+
dtype : int32
134+
full shape of source array : (2, 3)
135+
full size of source array : 0.00 GB
136+
137+
buffer shape : (2, 3)
138+
expected RAM usage : 0.00 GB
139+
140+
chunk shape : (2, 3)
141+
disk space usage per chunk : 0.00 MB
142+
143+
compression method : gzip
144+
145+
146+
processing/ecephys/NewProcessedTimeSeries/data
147+
----------------------------------------------
148+
dtype : float64
149+
full shape of source array : (4, 2)
150+
full size of source array : 0.00 GB
151+
152+
buffer shape : (4, 2)
153+
expected RAM usage : 0.00 GB
154+
155+
chunk shape : (4, 2)
156+
disk space usage per chunk : 0.00 MB
157+
158+
compression method : gzip
159+
160+
"""
161+
assert stdout.getvalue() == expected_print
162+
163+
164+
def test_complex_zarr(zarr_nwbfile_path):
165+
with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io:
166+
nwbfile = io.read()
167+
168+
raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32")
169+
raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array)
170+
nwbfile.add_acquisition(raw_time_series)
171+
172+
number_of_epochs = 5
173+
for start_time, stop_time in zip(
174+
np.linspace(start=0.0, stop=10.0, num=number_of_epochs),
175+
np.linspace(start=1.0, stop=11.0, num=number_of_epochs),
176+
):
177+
nwbfile.add_epoch(start_time=start_time, stop_time=stop_time)
178+
179+
ecephys_module = get_module(nwbfile=nwbfile, name="ecephys")
180+
processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]])
181+
processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array)
182+
ecephys_module.add(processed_time_series)
183+
184+
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="zarr")
185+
186+
assert isinstance(backend_configuration, ZarrBackendConfiguration)
187+
188+
dataset_configurations = backend_configuration.dataset_configurations
189+
assert len(dataset_configurations) == 4
190+
assert "acquisition/NewRawTimeSeries/data" in dataset_configurations
191+
assert "epochs/start_time/data" in dataset_configurations
192+
assert "epochs/stop_time/data" in dataset_configurations
193+
assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations
194+
195+
# Best summary test of expected output is the printout
196+
with patch("sys.stdout", new=StringIO()) as stdout:
197+
print(backend_configuration)
198+
199+
expected_print = """
200+
Configurable datasets identified using the zarr backend
201+
-------------------------------------------------------
202+
203+
epochs/start_time/data
204+
----------------------
205+
dtype : float64
206+
full shape of source array : (5,)
207+
full size of source array : 0.00 GB
208+
209+
buffer shape : (5,)
210+
expected RAM usage : 0.00 GB
211+
212+
chunk shape : (5,)
213+
disk space usage per chunk : 0.00 MB
214+
215+
compression method : gzip
216+
217+
218+
epochs/stop_time/data
219+
---------------------
220+
dtype : float64
221+
full shape of source array : (5,)
222+
full size of source array : 0.00 GB
223+
224+
buffer shape : (5,)
225+
expected RAM usage : 0.00 GB
226+
227+
chunk shape : (5,)
228+
disk space usage per chunk : 0.00 MB
229+
230+
compression method : gzip
231+
232+
233+
acquisition/NewRawTimeSeries/data
234+
---------------------------------
235+
dtype : int32
236+
full shape of source array : (2, 3)
237+
full size of source array : 0.00 GB
238+
239+
buffer shape : (2, 3)
240+
expected RAM usage : 0.00 GB
241+
242+
chunk shape : (2, 3)
243+
disk space usage per chunk : 0.00 MB
244+
245+
compression method : gzip
246+
247+
248+
processing/ecephys/NewProcessedTimeSeries/data
249+
----------------------------------------------
250+
dtype : float64
251+
full shape of source array : (4, 2)
252+
full size of source array : 0.00 GB
253+
254+
buffer shape : (4, 2)
255+
expected RAM usage : 0.00 GB
256+
257+
chunk shape : (4, 2)
258+
disk space usage per chunk : 0.00 MB
259+
260+
compression method : gzip
261+
262+
"""
263+
assert stdout.getvalue() == expected_print

tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_hdf5_backend_configuration_print():
2323
full size of source array : 1.38 GB
2424
2525
buffer shape : (1250000, 384)
26-
maximum RAM usage per iteration : 0.96 GB
26+
expected RAM usage : 0.96 GB
2727
2828
chunk shape : (78125, 64)
2929
disk space usage per chunk : 10.00 MB
@@ -38,7 +38,7 @@ def test_hdf5_backend_configuration_print():
3838
full size of source array : 0.06 GB
3939
4040
buffer shape : (75000, 384)
41-
maximum RAM usage per iteration : 0.06 GB
41+
expected RAM usage : 0.06 GB
4242
4343
chunk shape : (37500, 128)
4444
disk space usage per chunk : 9.60 MB

tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_hdf5_dataset_configuration_print():
2121
full size of source array : 1.38 GB
2222
2323
buffer shape : (1250000, 384)
24-
maximum RAM usage per iteration : 0.96 GB
24+
expected RAM usage : 0.96 GB
2525
2626
chunk shape : (78125, 64)
2727
disk space usage per chunk : 10.00 MB
@@ -47,7 +47,7 @@ def test_hdf5_dataset_configuration_print_with_compression_options():
4747
full size of source array : 1.38 GB
4848
4949
buffer shape : (1250000, 384)
50-
maximum RAM usage per iteration : 0.96 GB
50+
expected RAM usage : 0.96 GB
5151
5252
chunk shape : (78125, 64)
5353
disk space usage per chunk : 10.00 MB
@@ -74,7 +74,7 @@ def test_hdf5_dataset_configuration_print_with_compression_disabled():
7474
full size of source array : 1.38 GB
7575
7676
buffer shape : (1250000, 384)
77-
maximum RAM usage per iteration : 0.96 GB
77+
expected RAM usage : 0.96 GB
7878
7979
chunk shape : (78125, 64)
8080
disk space usage per chunk : 10.00 MB

tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_zarr_backend_configuration_print():
2323
full size of source array : 1.38 GB
2424
2525
buffer shape : (1250000, 384)
26-
maximum RAM usage per iteration : 0.96 GB
26+
expected RAM usage : 0.96 GB
2727
2828
chunk shape : (78125, 64)
2929
disk space usage per chunk : 10.00 MB
@@ -40,7 +40,7 @@ def test_zarr_backend_configuration_print():
4040
full size of source array : 0.06 GB
4141
4242
buffer shape : (75000, 384)
43-
maximum RAM usage per iteration : 0.06 GB
43+
expected RAM usage : 0.06 GB
4444
4545
chunk shape : (37500, 128)
4646
disk space usage per chunk : 9.60 MB

0 commit comments

Comments
 (0)