Skip to content

Error Too many open files when using LocalFileSystem #841

@observingClouds

Description

@observingClouds

Hi 👋,

I would like to open a local Zarr store with VirtualiZarr and the ZarrParser, following the approach used in TestOpenVirtualDatasetZarr. While the test suite of this package runs without issues, the following minimal example raises a "Too many open files" error.

Minimal reproducible example

# Create dataset: two random variables on a 1 x 500 x 500 (time, x, y) grid.
import xarray as xr
import dask.array as da
import numpy as np

time_size, x_size, y_size = 1, 500, 500
# Chunks of (1, 5, 5) over a 1 x 500 x 500 array give 100 * 100 = 10,000
# chunks per variable; the large chunk count is what this example exercises.
data_a = da.random.random((time_size, x_size, y_size), chunks=(1, 5, 5))
data_b = da.random.random((time_size, x_size, y_size), chunks=(1, 5, 5))

# Create the xarray dataset with variables "a" and "b" sharing the same
# dimensions and integer coordinates.
dataset = xr.Dataset(
    {
        "a": (("time", "x", "y"), data_a),
        "b": (("time", "x", "y"), data_b),
    },
    coords={
        "time": np.arange(time_size),
        "x": np.arange(x_size),
        "y": np.arange(y_size),
    }
)
# Write the dataset to a Zarr store at the relative path "test.zarr".
dataset.to_zarr("test.zarr")

# Open with VirtualiZarr
from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import ZarrParser
from obstore.store import LocalStore
from virtualizarr.registry import ObjectStoreRegistry
from pathlib import Path

# Absolute path of the store written above (resolved against the current
# working directory).
zarr_store = str(Path.cwd() / 'test.zarr')
store = LocalStore(prefix=zarr_store)
# Register the local store under the file:// URL of the Zarr directory.
registry = ObjectStoreRegistry({f"file://{zarr_store}": store})
parser = ZarrParser()
# This is the call that fails with "Too many open files (os error 24)".
vds = open_virtual_dataset( url=zarr_store,registry=registry,parser=parser)

Results in:

GenericError: Generic LocalFileSystem error: Unable to open file /Users/XXXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68: Too many open files (os error 24)

Debug source:
Generic {
    store: "LocalFileSystem",
    source: UnableToOpenFile {
        source: Os {
            code: 24,
            kind: Uncategorized,
            message: "Too many open files",
        },
        path: "/Users/XXXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68",
    },
}
Full traceback
---------------------------------------------------------------------------
GenericError                              Traceback (most recent call last)
Cell In[1], line 35
     33 registry = ObjectStoreRegistry({f"file://{zarr_store}": store})
     34 parser = ZarrParser()
---> 35 vds = open_virtual_dataset( url=zarr_store,registry=registry,parser=parser)

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/xarray.py:88, in open_virtual_dataset(url, registry, parser, drop_variables, loadable_variables, decode_times)
     46 """
     47 Open an archival data source as an [xarray.Dataset][] wrapping virtualized zarr arrays.
     48 
   (...)     84     in `loadable_variables` and normal lazily indexed arrays for each variable in `loadable_variables`.
     85 """
     86 filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri())
---> 88 manifest_store = parser(
     89     url=filepath,
     90     registry=registry,
     91 )
     93 ds = manifest_store.to_virtual_dataset(
     94     loadable_variables=loadable_variables,
     95     decode_times=decode_times,
     96 )
     97 return ds.drop_vars(list(drop_variables or ()))

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:515, in ZarrParser.__call__(self, url, registry)
    513 object_store, _ = registry.resolve(path)
    514 zarr_store = ObjectStore(store=object_store)
--> 515 manifest_group = asyncio.run(
    516     _construct_manifest_group(
    517         store=zarr_store,
    518         path=url,
    519         group=self.group,
    520         skip_variables=self.skip_variables,
    521     )
    522 )
    523 return ManifestStore(registry=registry, group=manifest_group)

File ~/.local/share/uv/python/cpython-3.14.1-macos-x86_64-none/lib/python3.14/asyncio/runners.py:204, in run(main, debug, loop_factory)
    200     raise RuntimeError(
    201         "asyncio.run() cannot be called from a running event loop")
    203 with Runner(debug=debug, loop_factory=loop_factory) as runner:
--> 204     return runner.run(main)

File ~/.local/share/uv/python/cpython-3.14.1-macos-x86_64-none/lib/python3.14/asyncio/runners.py:127, in Runner.run(self, coro, context)
    125 self._interrupt_count = 0
    126 try:
--> 127     return self._loop.run_until_complete(task)
    128 except exceptions.CancelledError:
    129     if self._interrupt_count > 0:

File ~/.local/share/uv/python/cpython-3.14.1-macos-x86_64-none/lib/python3.14/asyncio/base_events.py:719, in BaseEventLoop.run_until_complete(self, future)
    716 if not future.done():
    717     raise RuntimeError('Event loop stopped before Future completed.')
--> 719 return future.result()

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:394, in _construct_manifest_group(path, store, skip_variables, group)
    384 _skip_variables = [] if skip_variables is None else list(skip_variables)
    386 zarr_arrays = await asyncio.gather(
    387     *[
    388         zarr_group.getitem(var)
   (...)    391     ]
    392 )
--> 394 manifest_arrays = await asyncio.gather(
    395     *[_construct_manifest_array(array, path) for array in zarr_arrays]  # type: ignore[arg-type]
    396 )
    398 manifest_dict = {
    399     array.basename: result for array, result in zip(zarr_arrays, manifest_arrays)
    400 }
    402 manifest_group = ManifestGroup(manifest_dict, attributes=zarr_group.attrs)

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:369, in _construct_manifest_array(zarr_array, path)
    367 """Construct a ManifestArray from a zarr array."""
    368 array_metadata = get_metadata(zarr_array)
--> 369 chunk_manifest = await build_chunk_manifest(zarr_array, path)
    370 return ManifestArray(metadata=array_metadata, chunkmanifest=chunk_manifest)

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:332, in build_chunk_manifest(zarr_array, path)
    324 """Build a ChunkManifest from chunk coordinate mappings.
    325 
    326 Note: Chunk keys are discovered by listing what's actually in storage rather than
   (...)    329 missing, Zarr will return the fill_value for those regions when the array is read.
    330 """
    331 strategy = get_strategy(zarr_array)
--> 332 chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
    334 if not chunk_map:
    335     import math

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:288, in ZarrV3Strategy.get_chunk_mapping(self, zarr_array, path)
    285     return {}
    287 chunk_keys = [x[0] for x in prefix_keys]
--> 288 return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/parsers/zarr.py:117, in _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
    114 if not chunk_keys:
    115     return {}
--> 117 lengths = await _concurrent_map(
    118     [(k,) for k in chunk_keys], zarr_array.store.getsize
    119 )
    120 dict_keys = _normalize_chunk_keys(chunk_keys, prefix)
    121 paths = [join_url(path, k) for k in chunk_keys]

File ~/Documents/GitHub/VirtualiZarr/virtualizarr/vendor/zarr/core/common.py:23, in _concurrent_map(items, func, limit)
     17 async def _concurrent_map(
     18     items: Iterable[T],
     19     func: Callable[..., Awaitable[V]],
     20     limit: int | None = None,
     21 ) -> list[V]:
     22     if limit is None:
---> 23         return await asyncio.gather(*list(starmap(func, items)))
     25     else:
     26         sem = asyncio.Semaphore(limit)

File ~/Documents/GitHub/VirtualiZarr/.venv/lib/python3.14/site-packages/zarr/storage/_obstore.py:246, in ObjectStore.getsize(self, key)
    242 async def getsize(self, key: str) -> int:
    243     # docstring inherited
    244     import obstore as obs
--> 246     resp = await obs.head_async(self.store, key)
    247     return resp["size"]

GenericError: Generic LocalFileSystem error: Unable to open file /Users/XXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68: Too many open files (os error 24)

Debug source:
Generic {
    store: "LocalFileSystem",
    source: UnableToOpenFile {
        source: Os {
            code: 24,
            kind: Uncategorized,
            message: "Too many open files",
        },
        path: "/Users/XXXX/Documents/GitHub/VirtualiZarr/test.zarr/a/c/0/92/68",
    },
}

Expectation

I expected this to succeed in the same way as it does for a dataset with fewer chunks.

Versions

virtualizarr       2.2.2.dev1+gcf8e0dbe6
xarray              2025.12.1.dev10+g3c6b050bf
fsspec             2025.12.0
obstore            0.8.2
zarr                  3.1.6.dev6+g65fec7142

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions