
Commit 54ac2fe

Authored by pratiman-91, headtr1ck, pre-commit-ci[bot], and kmuehlbauer
add kwarg to handle invalid files in open_mfdataset (#9955)
* GH6736
* Updated whats-new.rst
* Update xarray/backends/api.py (Co-authored-by: Michael Niklas <[email protected]>)
* Updated logic
* Added tests and modified the logic to get correct ids for concat
* Added new tests and logic to handle 2x2 open_mfdataset with ignore and warn
* pre-commit run
* new logic to add nested paths
* made remove_path a private function and updated whats-new.rst
* Updated whats-new.rst
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* removed entry from whats-new.rst
* Remove conflict
* Whats-new conflicts
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* modify docs
* modify doc-strings
* Update xarray/backends/api.py (Co-authored-by: Kai Mühlbauer <[email protected]>)
* Update xarray/backends/api.py (Co-authored-by: Kai Mühlbauer <[email protected]>)
* catch exception for warn
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* Update xarray/backends/api.py (Co-authored-by: Kai Mühlbauer <[email protected]>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* import emit_user_level_warning
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* retry importing emit_user_level_warning
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* emit_user_level_warning
* adding import
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* Updated whats-new.rst
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* Update doc/whats-new.rst
* Update whats-new.rst
* store invalid files in a set and remove them only once
* modified the logic to remove invalid files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* importing TypeVar
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* making FLike private
* fixing mypy errors
* importing List from typing
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* Updated whats-new
* remove whats-new
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* Updated whats-new.rst

---------

Co-authored-by: Michael Niklas <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent ea9f02b · commit 54ac2fe

File tree: 3 files changed, +123 −3 lines


doc/whats-new.rst
Lines changed: 2 additions & 0 deletions

```diff
@@ -15,6 +15,8 @@ New Features
 
 - :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return bytes if called without a filepath. (:issue:`10570`)
   By `Matthew Willson <https://github.com/mjwillson>`_.
+- Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`)
+  By `Pratiman Patel <https://github.com/pratiman-91>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
```

xarray/backends/api.py
Lines changed: 59 additions & 3 deletions

```diff
@@ -19,6 +19,7 @@
     Any,
     Final,
     Literal,
+    TypeVar,
     Union,
     cast,
     overload,
@@ -45,8 +46,8 @@
 from xarray.core.datatree import DataTree
 from xarray.core.indexes import Index
 from xarray.core.treenode import group_subtrees
-from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
-from xarray.core.utils import is_remote_uri
+from xarray.core.types import NetcdfWriteModes, ReadBuffer, ZarrWriteModes
+from xarray.core.utils import emit_user_level_warning, is_remote_uri
 from xarray.namedarray.daskmanager import DaskManager
 from xarray.namedarray.parallelcompat import guess_chunkmanager
 from xarray.structure.chunks import _get_chunk, _maybe_chunk
@@ -73,6 +74,7 @@
     from xarray.core.types import (
         CombineAttrsOptions,
         CompatOptions,
+        ErrorOptionsWithWarn,
         JoinOptions,
         NestedSequence,
         ReadBuffer,
@@ -1459,6 +1461,28 @@ def open_groups(
     return groups
 
 
+_FLike = TypeVar("_FLike", bound=Union[str, ReadBuffer])
+
+
+def _remove_path(
+    paths: NestedSequence[_FLike], paths_to_remove: set[_FLike]
+) -> NestedSequence[_FLike]:
+    # Accumulate the surviving entries, preserving the nesting structure
+    result: list[Union[_FLike, NestedSequence[_FLike]]] = []
+
+    for item in paths:
+        if isinstance(item, list):
+            # If the current item is a list, recursively call _remove_path on it
+            nested_result = _remove_path(item, paths_to_remove)
+            if nested_result:  # Only keep nested lists that are non-empty
+                result.append(nested_result)
+        elif item not in paths_to_remove:
+            # Keep the item if it is not in the set of paths to remove
+            result.append(item)
+
+    return result
+
+
 def open_mfdataset(
     paths: str
     | os.PathLike
```
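To make the helper's behavior concrete, here is a minimal standalone sketch of the same recursive filtering, re-defined without the type annotations so it runs on its own (the file names are made up for the demonstration):

```python
# Minimal sketch of _remove_path's filtering behavior. Standalone
# re-definition for illustration only; the real helper in the diff above
# carries NestedSequence/TypeVar annotations.
def _remove_path(paths, paths_to_remove):
    result = []
    for item in paths:
        if isinstance(item, list):
            # Recurse into nested lists of paths
            nested = _remove_path(item, paths_to_remove)
            if nested:  # drop any list the removal left empty
                result.append(nested)
        elif item not in paths_to_remove:
            result.append(item)
    return result

nested_paths = [["a.nc", "b.nc"], ["bad.nc"], ["bad.nc", "c.nc"]]
print(_remove_path(nested_paths, {"bad.nc"}))
# [['a.nc', 'b.nc'], ['c.nc']] -- the inner list left empty is dropped entirely
```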
```diff
@@ -1487,6 +1511,7 @@ def open_mfdataset(
     join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
+    errors: ErrorOptionsWithWarn = "raise",
     **kwargs,
 ) -> Dataset:
     """Open multiple files as a single dataset.
@@ -1613,6 +1638,12 @@ def open_mfdataset(
 
         If a callable, it must expect a sequence of ``attrs`` dicts and a context object
         as its only parameters.
+    errors : {"raise", "warn", "ignore"}, default: "raise"
+        String indicating how to handle errors when opening datasets.
+
+        - "raise": an invalid dataset will raise an exception.
+        - "warn": a warning will be issued for each invalid dataset.
+        - "ignore": invalid datasets will be ignored.
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`. For an
         overview of some of the possible options, see the documentation of
```
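As a usage sketch of the new keyword (file names here are hypothetical; "broken.nc" stands in for a corrupt or missing file):

```python
import xarray as xr

# Hypothetical file names for illustration; "broken.nc" stands in for a
# corrupt or missing member of an otherwise valid collection.
ds = xr.open_mfdataset(
    ["part1.nc", "broken.nc", "part2.nc"],
    combine="nested",
    concat_dim="time",
    errors="ignore",  # skip broken.nc silently instead of raising
)
```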
```diff
@@ -1705,7 +1736,32 @@ def open_mfdataset(
     open_ = open_dataset
     getattr_ = getattr
 
-    datasets = [open_(p, **open_kwargs) for p in paths1d]
+    if errors not in ("raise", "warn", "ignore"):
+        raise ValueError(
+            f"'errors' must be 'raise', 'warn' or 'ignore', got '{errors}'"
+        )
+
+    datasets = []
+    invalid_paths = set()
+    for p in paths1d:
+        try:
+            ds = open_(p, **open_kwargs)
+            datasets.append(ds)
+        except Exception as e:
+            if errors == "raise":
+                raise
+            elif errors == "warn":
+                emit_user_level_warning(f"Could not open {p} due to {e}. Ignoring.")
+            # record the invalid path so it can be removed below
+            invalid_paths.add(p)
+
+    if invalid_paths:
+        paths = _remove_path(paths, invalid_paths)
+        if combine == "nested":
+            # Create new ids and paths based on the removed items
+            combined_ids_paths = _infer_concat_order_from_positions(paths)
+            ids = list(combined_ids_paths.keys())
+
     closers = [getattr_(ds, "_close") for ds in datasets]
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
```
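And a sketch of the "warn" path implemented above: each failing open emits a user-level warning and the offending path is dropped before combining (again with hypothetical file names):

```python
import warnings

import xarray as xr

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    ds = xr.open_mfdataset(
        ["part1.nc", "missing.nc"],  # hypothetical file names
        combine="nested",
        concat_dim="time",
        errors="warn",
    )

# One UserWarning per invalid file, of the form:
#   "Could not open missing.nc due to <exception>. Ignoring."
for w in caught:
    print(w.message)
```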

xarray/tests/test_backends.py
Lines changed: 62 additions & 0 deletions

```diff
@@ -5371,6 +5371,68 @@ def test_open_mfdataset_2(self) -> None:
                 ) as actual:
                     assert_identical(original, actual)
 
+    def test_open_mfdataset_with_ignore(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with create_tmp_files(2) as (tmp1, tmp2):
+            ds1 = original.isel(x=slice(5))
+            ds2 = original.isel(x=slice(5, 10))
+            ds1.to_netcdf(tmp1)
+            ds2.to_netcdf(tmp2)
+            with open_mfdataset(
+                [tmp1, "non-existent-file.nc", tmp2],
+                concat_dim="x",
+                combine="nested",
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_with_warn(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(2) as (tmp1, tmp2):
+                ds1 = original.isel(x=slice(5))
+                ds2 = original.isel(x=slice(5, 10))
+                ds1.to_netcdf(tmp1)
+                ds2.to_netcdf(tmp2)
+                with open_mfdataset(
+                    [tmp1, "non-existent-file.nc", tmp2],
+                    concat_dim="x",
+                    combine="nested",
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_ignore(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+            original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+            original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+            original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+            original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+            with open_mfdataset(
+                [[tmp1, tmp2], ["non-existent-file.nc", tmp3, tmp4]],
+                combine="nested",
+                concat_dim=["y", "x"],
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_warn(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+                original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+                original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+                original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+                original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+                with open_mfdataset(
+                    [[tmp1, tmp2, "non-existent-file.nc"], [tmp3, tmp4]],
+                    combine="nested",
+                    concat_dim=["y", "x"],
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
     def test_attrs_mfdataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
         with create_tmp_file() as tmp1:
```
