Skip to content

Commit 9e194a4

Browse files
bendichter and rly authored
change chunk default size to 10MB (#925)
Co-authored-by: Ryan Ly <[email protected]>
1 parent 64a444f commit 9e194a4

File tree

3 files changed

+12
-11
lines changed

3 files changed

+12
-11
lines changed

src/hdmf/backends/hdf5/h5tools.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,8 @@
2929
H5_REF = special_dtype(ref=Reference)
3030
H5_REGREF = special_dtype(ref=RegionReference)
3131

32+
RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB
33+
3234
H5PY_3 = h5py.__version__.startswith('3')
3335

3436

@@ -745,7 +747,7 @@ def __read_ref(self, h5obj):
745747
def open(self):
746748
if self.__file is None:
747749
open_flag = self.__mode
748-
kwargs = dict()
750+
kwargs = dict(rdcc_nbytes=RDCC_NBYTES)
749751
if self.comm:
750752
kwargs.update(driver='mpio', comm=self.comm)
751753

src/hdmf/data_utils.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,7 @@ class GenericDataChunkIterator(AbstractDataChunkIterator):
154154
doc=(
155155
"If chunk_shape is not specified, it will be inferred as the smallest chunk "
156156
"below the chunk_mb threshold.",
157-
"Defaults to 1MB.",
157+
"Defaults to 10MB.",
158158
),
159159
default=None,
160160
),
@@ -187,9 +187,8 @@ def __init__(self, **kwargs):
187187
Advanced users are offered full control over the shape parameters for the buffer and the chunks; however,
188188
the chunk shape must perfectly divide the buffer shape along each axis.
189189
190-
HDF5 also recommends not setting chunk_mb greater than 1 MB for optimal caching speeds.
191-
See https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf
192-
for more details.
190+
HDF5 recommends chunk size in the range of 2 to 16 MB for optimal cloud performance.
191+
https://youtu.be/rcS5vt-mKok?t=621
193192
"""
194193
buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, self.progress_bar_options = getargs(
195194
"buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs
@@ -198,7 +197,7 @@ def __init__(self, **kwargs):
198197
if buffer_gb is None and buffer_shape is None:
199198
buffer_gb = 1.0
200199
if chunk_mb is None and chunk_shape is None:
201-
chunk_mb = 1.0
200+
chunk_mb = 10.0
202201
assert (buffer_gb is not None) != (
203202
buffer_shape is not None
204203
), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!"

tests/unit/utils_test/test_core_GenericDataChunkIterator.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -277,17 +277,17 @@ def test_numpy_array_chunk_iterator(self):
277277

278278
def test_buffer_shape_option(self):
279279
expected_buffer_shape = (1580, 316)
280-
iterator_options = dict(buffer_shape=expected_buffer_shape)
280+
iterator_options = dict(buffer_shape=expected_buffer_shape, chunk_mb=1.0)
281281
self.check_first_data_chunk_call(
282282
expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in expected_buffer_shape]),
283283
iterator_options=iterator_options,
284284
)
285285
self.check_direct_hdf5_write(iterator_options=iterator_options)
286286

287287
def test_buffer_gb_option(self):
288-
# buffer is smaller than default chunk; should collapse to chunk shape
288+
# buffer is smaller than chunk; should collapse to chunk shape
289289
resulting_buffer_shape = (1580, 316)
290-
iterator_options = dict(buffer_gb=0.0005)
290+
iterator_options = dict(buffer_gb=0.0005, chunk_mb=1.0)
291291
self.check_first_data_chunk_call(
292292
expected_selection=tuple(
293293
[
@@ -334,14 +334,14 @@ def test_chunk_mb_option_while_condition(self):
334334
"""Test to evoke while condition of default shaping method."""
335335
expected_chunk_shape = (2, 79, 79)
336336
special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(2, 2000, 2000), dtype="int16")
337-
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array)
337+
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0)
338338
self.assertEqual(iterator.chunk_shape, expected_chunk_shape)
339339

340340
def test_chunk_mb_option_while_condition_unit_maxshape_axis(self):
341341
"""Test to evoke while condition of default shaping method."""
342342
expected_chunk_shape = (1, 79, 79)
343343
special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(1, 2000, 2000), dtype="int16")
344-
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array)
344+
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0)
345345
self.assertEqual(iterator.chunk_shape, expected_chunk_shape)
346346

347347
@unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed")

0 commit comments

Comments
 (0)