-
Notifications
You must be signed in to change notification settings - Fork 52
Open
Description
Hi 👋,
I would like to open a local Zarr store with VirtualiZarr and the ZarrParser, following the approach used in TestOpenVirtualDatasetZarr. While the test suite of this package runs without issues, the following minimal example raises a "Too many open files" error.
Minimal reproducible example
# Create dataset
import xarray as xr
import dask.array as da
import numpy as np
time_size, x_size, y_size = 1, 500, 500
data_a = da.random.random((time_size, x_size, y_size), chunks=(1, 5, 5))
data_b = da.random.random((time_size, x_size, y_size), chunks=(1, 5, 5))
# Create the xarray dataset
dataset = xr.Dataset(
{
"a": (("time", "x", "y"), data_a),
"b": (("time", "x", "y"), data_b),
},
coords={
"time": np.arange(time_size),
"x": np.arange(x_size),
"y": np.arange(y_size),
}
)
dataset.to_zarr("test.zarr")
# Open with Virtualizarr
from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import ZarrParser
from obstore.store import LocalStore
from virtualizarr.registry import ObjectStoreRegistry
from pathlib import Path
zarr_store = str(Path.cwd() / 'test.zarr')
store = LocalStore(prefix=zarr_store)
registry = ObjectStoreRegistry({f"file://{zarr_store}": store})
parser = ZarrParser()
vds = open_virtual_dataset( url=zarr_store,registry=registry,parser=parser)
Results in:
GenericError: Generic LocalFileSystem error: Unable to open file /Users/XXXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68: Too many open files (os error 24)
Debug source:
Generic {
store: "LocalFileSystem",
source: UnableToOpenFile {
source: Os {
code: 24,
kind: Uncategorized,
message: "Too many open files",
},
path: "/Users/XXXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68",
},
}
Full traceback
---------------------------------------------------------------------------
GenericError Traceback (most recent call last)
Cell In[1], line 35
33 registry = ObjectStoreRegistry({f"file://{zarr_store}": store})
34 parser = ZarrParser()
---> 35 vds = open_virtual_dataset( url=zarr_store,registry=registry,parser=parser)
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/xarray.py:88, in open_virtual_dataset(url, registry, parser, drop_variables, loadable_variables, decode_times)
46 """
47 Open an archival data source as an [xarray.Dataset][] wrapping virtualized zarr arrays.
48
(...) 84 in `loadable_variables` and normal lazily indexed arrays for each variable in `loadable_variables`.
85 """
86 filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri())
---> 88 manifest_store = parser(
89 url=filepath,
90 registry=registry,
91 )
93 ds = manifest_store.to_virtual_dataset(
94 loadable_variables=loadable_variables,
95 decode_times=decode_times,
96 )
97 return ds.drop_vars(list(drop_variables or ()))
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:515, in ZarrParser.__call__(self, url, registry)
513 object_store, _ = registry.resolve(path)
514 zarr_store = ObjectStore(store=object_store)
--> 515 manifest_group = asyncio.run(
516 _construct_manifest_group(
517 store=zarr_store,
518 path=url,
519 group=self.group,
520 skip_variables=self.skip_variables,
521 )
522 )
523 return ManifestStore(registry=registry, group=manifest_group)
File ~/.local/share/uv/python/cpython-3.14.1-macos-x86_64-none/lib/python3.14/asyncio/runners.py:204, in run(main, debug, loop_factory)
200 raise RuntimeError(
201 "asyncio.run() cannot be called from a running event loop")
203 with Runner(debug=debug, loop_factory=loop_factory) as runner:
--> 204 return runner.run(main)
File ~/.local/share/uv/python/cpython-3.14.1-macos-x86_64-none/lib/python3.14/asyncio/runners.py:127, in Runner.run(self, coro, context)
125 self._interrupt_count = 0
126 try:
--> 127 return self._loop.run_until_complete(task)
128 except exceptions.CancelledError:
129 if self._interrupt_count > 0:
File ~/.local/share/uv/python/cpython-3.14.1-macos-x86_64-none/lib/python3.14/asyncio/base_events.py:719, in BaseEventLoop.run_until_complete(self, future)
716 if not future.done():
717 raise RuntimeError('Event loop stopped before Future completed.')
--> 719 return future.result()
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:394, in _construct_manifest_group(path, store, skip_variables, group)
384 _skip_variables = [] if skip_variables is None else list(skip_variables)
386 zarr_arrays = await asyncio.gather(
387 *[
388 zarr_group.getitem(var)
(...) 391 ]
392 )
--> 394 manifest_arrays = await asyncio.gather(
395 *[_construct_manifest_array(array, path) for array in zarr_arrays] # type: ignore[arg-type]
396 )
398 manifest_dict = {
399 array.basename: result for array, result in zip(zarr_arrays, manifest_arrays)
400 }
402 manifest_group = ManifestGroup(manifest_dict, attributes=zarr_group.attrs)
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:369, in _construct_manifest_array(zarr_array, path)
367 """Construct a ManifestArray from a zarr array."""
368 array_metadata = get_metadata(zarr_array)
--> 369 chunk_manifest = await build_chunk_manifest(zarr_array, path)
370 return ManifestArray(metadata=array_metadata, chunkmanifest=chunk_manifest)
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:332, in build_chunk_manifest(zarr_array, path)
324 """Build a ChunkManifest from chunk coordinate mappings.
325
326 Note: Chunk keys are discovered by listing what's actually in storage rather than
(...) 329 missing, Zarr will return the fill_value for those regions when the array is read.
330 """
331 strategy = get_strategy(zarr_array)
--> 332 chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
334 if not chunk_map:
335 import math
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:288, in ZarrV3Strategy.get_chunk_mapping(self, zarr_array, path)
285 return {}
287 chunk_keys = [x[0] for x in prefix_keys]
--> 288 return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:117, in _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
114 if not chunk_keys:
115 return {}
--> 117 lengths = await _concurrent_map(
118 [(k,) for k in chunk_keys], zarr_array.store.getsize
119 )
120 dict_keys = _normalize_chunk_keys(chunk_keys, prefix)
121 paths = [join_url(path, k) for k in chunk_keys]
File ~/Documents/GitHub/VirtualiZarr/virtualizarr/vendor/zarr/core/common.py:23, in _concurrent_map(items, func, limit)
17 async def _concurrent_map(
18 items: Iterable[T],
19 func: Callable[..., Awaitable[V]],
20 limit: int | None = None,
21 ) -> list[V]:
22 if limit is None:
---> 23 return await asyncio.gather(*list(starmap(func, items)))
25 else:
26 sem = asyncio.Semaphore(limit)
File ~/Documents/GitHub/VirtualiZarr/.venv/lib/python3.14/site-packages/zarr/storage/_obstore.py:246, in ObjectStore.getsize(self, key)
242 async def getsize(self, key: str) -> int:
243 # docstring inherited
244 import obstore as obs
--> 246 resp = await obs.head_async(self.store, key)
247 return resp["size"]
GenericError: Generic LocalFileSystem error: Unable to open file /Users/XXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68: Too many open files (os error 24)
Debug source:
Generic {
store: "LocalFileSystem",
source: UnableToOpenFile {
source: Os {
code: 24,
kind: Uncategorized,
message: "Too many open files",
},
path: "/Users/XXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68",
},
}
Expectation
I expected this to succeed in the same way as it does for a dataset with fewer chunks.
Versions
virtualizarr 2.2.2.dev1+gcf8e0dbe6
xarray 2025.12.1.dev10+g3c6b050bf
fsspec 2025.12.0
obstore 0.8.2
zarr 3.1.6.dev6+g65fec7142
Metadata
Metadata
Assignees
Labels
No labels