
Commit 709e645

Authored by euronion, pre-commit-ci[bot] and coroa
Code: Download era5 as grib (#439)
* [WIP] code: Enable downloading of ERA5 as GRIB
* code: simplifications
* code: simplifications cont
* code: simpify
* code: final simp
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* doc: add release notes
* doc: add docstrings
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* fix(era5): open gribfile directly
* chore(era5): remove cds alpha compatibility time/valid_time
* Use cutout chunks also as netcdf chunksizes encoding
* fix: finalizer for grib format

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jonas Hoersch <[email protected]>
1 parent a32eacc commit 709e645

File tree: 4 files changed, +185 -13 lines

* RELEASE_NOTES.rst
* atlite/data.py
* atlite/datasets/era5.py
* pyproject.toml

RELEASE_NOTES.rst

Lines changed: 8 additions & 0 deletions
```diff
@@ -20,6 +20,14 @@ Release Notes
 `v0.4.0 <https://github.com/PyPSA/atlite/releases/tag/v0.4.0>`__ (30th January 2025)
 =======================================================================================
 
+* ERA5 data is now downloaded using the `grib` backend of the CDS by default instead of the `netcdf` backend.
+  This change became necessary as the `netcdf` backend became size-limited, cf. https://forum.ecmwf.int/t/limitation-change-on-netcdf-era5-requests/12477.
+  Side effects are that downloads of ERA5 data should now be faster and possible for larger extents,
+  at the cost of more local processing and larger temporary storage requirements.
+  The yielded cutouts should be identical in their data to previous cutouts.
+  You can still use the `netcdf` backend by passing `data_format='netcdf'` to the `cutout.prepare()` method.
+  (https://github.com/PyPSA/atlite/issues/439)
+
 * The methods ``convert_cooling_demand`` and ``cooling_demand`` are implemented
   to evaluate cooling demand using the cooling degree-days approximation.
   (https://github.com/PyPSA/atlite/pull/415, https://github.com/PyPSA/atlite/pull/422)
```
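For context, a minimal usage sketch of the new option; the cutout extent and period below are placeholders, not part of this commit:

```python
import atlite

# Placeholder region and period; any small cutout works the same way.
cutout = atlite.Cutout(
    path="western-europe-2020-01.nc",
    module="era5",
    x=slice(-13.7, 1.8),
    y=slice(49.8, 60.9),
    time="2020-01",
)

cutout.prepare()                        # new default: download as grib
# cutout.prepare(data_format="netcdf")  # previous behaviour, now size-limited by the CDS
```
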

atlite/data.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -27,6 +27,7 @@ def get_features(
     cutout,
     module,
     features,
+    data_format,
     tmpdir=None,
     monthly_requests=False,
     concurrent_requests=False,
@@ -47,6 +48,7 @@ def get_features(
             cutout,
             feature,
             tmpdir=tmpdir,
+            data_format=data_format,
             lock=lock,
             monthly_requests=monthly_requests,
             concurrent_requests=concurrent_requests,
@@ -58,9 +60,15 @@ def get_features(
 
     ds = xr.merge(datasets, compat="equals")
     for v in ds:
-        ds[v].attrs["module"] = module
+        da = ds[v]
+        da.attrs["module"] = module
         fd = datamodules[module].features.items()
-        ds[v].attrs["feature"] = [k for k, l in fd if v in l].pop()
+        da.attrs["feature"] = [k for k, l in fd if v in l].pop()
+
+        if da.chunks is not None:
+            chunksizes = [c[0] for c in da.chunks]
+            da.encoding["chunksizes"] = chunksizes
+
     return ds
 
 
@@ -125,6 +133,7 @@ def cutout_prepare(
     cutout,
     features=None,
     tmpdir=None,
+    data_format="grib",
     overwrite=False,
     compression={"zlib": True, "complevel": 9, "shuffle": True},
     show_progress=False,
@@ -153,6 +162,8 @@ def cutout_prepare(
         Directory in which temporary files (for example retrieved ERA5 netcdf
         files) are stored. If set, the directory will not be deleted and the
         intermediate files can be examined.
+    data_format : str, optional
+        The data format used to retrieve the data. Only relevant for ERA5 data. The default is 'grib'.
     overwrite : bool, optional
         Whether to overwrite variables which are already included in the
         cutout. The default is False.
@@ -210,6 +221,7 @@ def cutout_prepare(
             module,
             missing_features,
             tmpdir=tmpdir,
+            data_format=data_format,
             monthly_requests=monthly_requests,
             concurrent_requests=concurrent_requests,
         )
```
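The `chunksizes` encoding added in `get_features` records the dask chunking of each variable so that `to_netcdf` later writes the same chunks on disk. A standalone sketch of the mechanism (toy dimensions, not from the commit):

```python
import numpy as np
import xarray as xr

# Dask-backed toy variable, chunked like a cutout field.
da = xr.DataArray(np.zeros((365, 40, 50)), dims=["time", "y", "x"]).chunk(
    {"time": 100, "y": 10, "x": 10}
)

# For a DataArray, .chunks is a tuple of per-dimension block-size tuples, here
# ((100, 100, 100, 65), (10, 10, 10, 10), (10, 10, 10, 10, 10)).
# The first entry per dimension is the nominal chunk size.
da.encoding["chunksizes"] = [c[0] for c in da.chunks]  # [100, 10, 10]

# xr.Dataset({"v": da}).to_netcdf("out.nc")  # writes 100 x 10 x 10 HDF5 chunks
```
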

atlite/datasets/era5.py

Lines changed: 161 additions & 11 deletions
```diff
@@ -12,6 +12,7 @@
 import os
 import warnings
 import weakref
+from pathlib import Path
 from tempfile import mkstemp
 
 import cdsapi
@@ -20,6 +21,7 @@
 import xarray as xr
 from dask import compute, delayed
 from dask.array import arctan2, sqrt
+from dask.utils import SerializableLock
 from numpy import atleast_1d
 
 from atlite.gis import maybe_swap_spatial_dims
@@ -86,9 +88,7 @@ def _rename_and_clean_coords(ds, add_lon_lat=True):
     Optionally (add_lon_lat, default:True) preserves latitude and
     longitude columns as 'lat' and 'lon'.
     """
-    ds = ds.rename({"longitude": "x", "latitude": "y"})
-    if "valid_time" in ds.sizes:
-        ds = ds.rename({"valid_time": "time"}).unify_chunks()
+    ds = ds.rename({"longitude": "x", "latitude": "y", "valid_time": "time"})
     # round coords since cds coords are float32 which would lead to mismatches
     ds = ds.assign_coords(
         x=np.round(ds.x.astype(float), 5), y=np.round(ds.y.astype(float), 5)
```
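A small illustration of the simplified rename (toy dataset, hypothetical values): since the finalized CDS always delivers a `valid_time` axis, the conditional fallback for the alpha-era naming could be dropped. Note that `Dataset.rename` raises a `ValueError` if a key is absent, so this relies on the new CDS output:

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"u100": (("valid_time", "latitude", "longitude"), np.zeros((2, 2, 2)))},
    coords={
        "valid_time": np.array(
            ["2020-01-01T00", "2020-01-01T01"], dtype="datetime64[ns]"
        ),
        "latitude": [60.0, 59.75],
        "longitude": [0.0, 0.25],
    },
)
ds = ds.rename({"longitude": "x", "latitude": "y", "valid_time": "time"})
print(dict(ds.sizes))  # {'time': 2, 'y': 2, 'x': 2}
```
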
```diff
@@ -331,20 +331,161 @@ def noisy_unlink(path):
         logger.error(f"Unable to delete file {path}, as it is still in use.")
 
 
-def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates):
+def add_finalizer(ds: xr.Dataset, target: str | Path):
+    logger.debug(f"Adding finalizer for {target}")
+    weakref.finalize(ds._close.__self__.ds, noisy_unlink, target)
+
+
+def sanitize_chunks(chunks, **dim_mapping):
+    dim_mapping = dict(time="valid_time", x="longitude", y="latitude") | dim_mapping
+    if not isinstance(chunks, dict):
+        # preserve "auto" or None
+        return chunks
+
+    return {
+        extname: chunks[intname]
+        for intname, extname in dim_mapping.items()
+        if intname in chunks
+    }
+
+
+def open_with_grib_conventions(
+    grib_file: str | Path, chunks=None, tmpdir: str | Path | None = None
+) -> xr.Dataset:
+    """
+    Open a grib file of ERA5 data from the CDS and convert it to netcdf conventions.
+
+    The function performs the same conversion as the CDS netcdf backend, but
+    locally. This is needed because grib is the recommended download format
+    for the CDS, with the conversion to netcdf happening on the user's side.
+    The routine is a reduced version of the one documented here:
+    https://confluence.ecmwf.int/display/CKB/GRIB+to+netCDF+conversion+on+new+CDS+and+ADS+systems#GRIBtonetCDFconversiononnewCDSandADSsystems-jupiternotebook
+
+    Parameters
+    ----------
+    grib_file : str | Path
+        Path to the grib file to be converted.
+    chunks : dict, optional
+        Chunking to apply when opening the dataset.
+    tmpdir : Path, optional
+        If None, a finalizer is added which deletes the grib file once the
+        dataset is closed.
+
+    Returns
+    -------
+    xr.Dataset
+    """
+    # Open the grib file as a dataset. The options combine different datasets
+    # into consistent hypercubes which are compatible with netCDF. Options that
+    # might be relevant e.g. for wave model data have been removed here to keep
+    # the code cleaner and shorter.
+    ds = xr.open_dataset(
+        grib_file,
+        engine="cfgrib",
+        time_dims=["valid_time"],
+        ignore_keys=["edition"],
+        # extra_coords={"expver": "valid_time"},
+        coords_as_attributes=[
+            "surface",
+            "depthBelowLandLayer",
+            "entireAtmosphere",
+            "heightAboveGround",
+            "meanSea",
+        ],
+        chunks=sanitize_chunks(chunks),
+    )
+    if tmpdir is None:
+        add_finalizer(ds, grib_file)
+
+    def safely_expand_dims(dataset: xr.Dataset, expand_dims: list[str]) -> xr.Dataset:
+        """
+        Expand dimensions in an xarray dataset, ensuring that the new dimensions
+        are not already in the dataset and that the order of dimensions is
+        preserved.
+        """
+        dims_required = [
+            c for c in dataset.coords if c in expand_dims + list(dataset.dims)
+        ]
+        dims_missing = [
+            (c, i) for i, c in enumerate(dims_required) if c not in dataset.dims
+        ]
+        dataset = dataset.expand_dims(
+            dim=[x[0] for x in dims_missing], axis=[x[1] for x in dims_missing]
+        )
+        return dataset
+
+    logger.debug("Converting grib file to netcdf format")
+    # Variables and dimensions to rename if they exist in the dataset
+    rename_vars = {
+        "time": "forecast_reference_time",
+        "step": "forecast_period",
+        "isobaricInhPa": "pressure_level",
+        "hybrid": "model_level",
+    }
+    rename_vars = {k: v for k, v in rename_vars.items() if k in ds}
+    ds = ds.rename(rename_vars)
+
+    # Safely expand dimensions so that data for the new dimensions are in the dataset
+    ds = safely_expand_dims(ds, ["valid_time", "pressure_level", "model_level"])
+
+    return ds
+
+
+def retrieve_data(
+    product: str,
+    chunks: dict[str, int] | None = None,
+    tmpdir: str | Path | None = None,
+    lock: SerializableLock | None = None,
+    **updates,
+) -> xr.Dataset:
     """
     Download data like ERA5 from the Climate Data Store (CDS).
 
     If you want to track the state of your request go to
     https://cds-beta.climate.copernicus.eu/requests?tab=all
+
+    Parameters
+    ----------
+    product : str
+        Product name, e.g. 'reanalysis-era5-single-levels'.
+    chunks : dict, optional
+        Chunking for the xarray dataset, e.g. {'time': 1, 'x': 100, 'y': 100}.
+        Default is None.
+    tmpdir : str, optional
+        Directory where the downloaded data is temporarily stored.
+        Default is None, which uses the system's temporary directory.
+    lock : dask.utils.SerializableLock, optional
+        Lock for thread-safe file writing. Default is None.
+    updates : dict
+        Additional parameters for the request.
+        Must include 'year', 'month', and 'variable'.
+        Can include e.g. 'data_format'.
+
+    Returns
+    -------
+    xarray.Dataset
+        Dataset with the retrieved variables.
+
+    Examples
+    --------
+    >>> ds = retrieve_data(
+    ...     product='reanalysis-era5-single-levels',
+    ...     chunks={'time': 1, 'x': 100, 'y': 100},
+    ...     tmpdir='/tmp',
+    ...     lock=None,
+    ...     year='2020',
+    ...     month='01',
+    ...     variable=['10m_u_component_of_wind', '10m_v_component_of_wind'],
+    ...     data_format='netcdf',
+    ... )
     """
-    request = {"product_type": "reanalysis", "format": "netcdf"}
+    request = {"product_type": ["reanalysis"], "download_format": "unarchived"}
     request.update(updates)
 
     assert {"year", "month", "variable"}.issubset(request), (
         "Need to specify at least 'variable', 'year' and 'month'"
     )
 
+    logger.debug(f"Requesting {product} with API request: {request}")
+
     client = cdsapi.Client(
         info_callback=logger.debug, debug=logging.DEBUG >= logging.root.level
     )
```
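To illustrate `sanitize_chunks` (using the definition above): it translates atlite's internal dimension names into the names used in cfgrib/CDS files, and passes non-dict chunk specifications through unchanged.

```python
print(sanitize_chunks({"time": 100, "x": 50, "y": 50}))
# {'valid_time': 100, 'longitude': 50, 'latitude': 50}

print(sanitize_chunks("auto"))  # 'auto' is preserved
print(sanitize_chunks(None))    # None is preserved

# Keyword arguments extend or override the default mapping:
print(sanitize_chunks({"time": 10}, time="time"))  # {'time': 10}
```
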
```diff
@@ -353,8 +494,9 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates):
     if lock is None:
         lock = nullcontext()
 
+    suffix = f".{request['data_format']}"  # .netcdf or .grib
     with lock:
-        fd, target = mkstemp(suffix=".nc", dir=tmpdir)
+        fd, target = mkstemp(suffix=suffix, dir=tmpdir)
         os.close(fd)
 
     # Inform user about data being downloaded as "* variable (year-month)"
```
```diff
@@ -364,10 +506,13 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates):
     logger.info(f"CDS: Downloading variables\n\t{varstr}\n")
     result.download(target)
 
-    ds = xr.open_dataset(target, chunks=chunks or {})
-    if tmpdir is None:
-        logger.debug(f"Adding finalizer for {target}")
-        weakref.finalize(ds._file_obj._manager, noisy_unlink, target)
+    # Convert from grib to netcdf locally, same conversion as in the CDS backend
+    if request["data_format"] == "grib":
+        ds = open_with_grib_conventions(target, chunks=chunks, tmpdir=tmpdir)
+    else:
+        ds = xr.open_dataset(target, chunks=sanitize_chunks(chunks))
+        if tmpdir is None:
+            add_finalizer(ds, target)
 
     return ds
```
```diff
@@ -377,6 +522,7 @@ def get_data(
     feature,
     tmpdir,
     lock=None,
+    data_format="grib",
     monthly_requests=False,
     concurrent_requests=False,
     **creation_parameters,
@@ -399,6 +545,9 @@ def get_data(
         If True, the data is requested on a monthly basis in ERA5. This is useful for
         large cutouts, where the data is requested in smaller chunks. The
         default is False.
+    data_format : str, optional
+        The format of the data to be downloaded. Can be either 'grib' or 'netcdf';
+        'grib' is highly recommended because the CDS API limits request sizes for netcdf.
     concurrent_requests : bool, optional
         If True, the monthly data requests are posted concurrently.
         Only has an effect if `monthly_requests` is True.
@@ -420,9 +569,10 @@ def get_data(
         "product": "reanalysis-era5-single-levels",
         "area": _area(coords),
         "chunks": cutout.chunks,
-        "grid": [cutout.dx, cutout.dy],
+        "grid": f"{cutout.dx}/{cutout.dy}",
         "tmpdir": tmpdir,
         "lock": lock,
+        "data_format": data_format,
     }
 
     func = globals().get(f"get_data_{feature}")
```
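For completeness, a hedged sketch of the new grib code path through `retrieve_data`, assuming CDS credentials are configured in `~/.cdsapirc`; the request values below are placeholders:

```python
ds = retrieve_data(
    product="reanalysis-era5-single-levels",
    chunks={"time": 100, "x": 100, "y": 100},
    tmpdir=None,  # grib file is removed by the finalizer once ds is closed
    year="2020",
    month="01",
    day=[f"{d:02d}" for d in range(1, 32)],
    time=[f"{h:02d}:00" for h in range(24)],
    variable=["10m_u_component_of_wind"],
    data_format="grib",  # the new default handed down from cutout.prepare()
)
# cfgrib exposes the time axis as 'valid_time'; _rename_and_clean_coords
# later maps valid_time/longitude/latitude to time/x/y.
```
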

pyproject.toml

Lines changed: 2 additions & 0 deletions
```diff
@@ -47,6 +47,8 @@ dependencies = [
     "pyproj>=2",
     "geopandas",
     "cdsapi>=0.7.4",
+    "cfgrib>=0.9.15.0",
+    "h5netcdf>=1.6.1",
 ]
 
 [project.urls]
```
