|
| 1 | +""" |
| 2 | +Saving a one year subset (with select data variables) of |
| 3 | +[WeatherBench2](https://weatherbench2.readthedocs.io/en/latest/data-guide.html) |
| 4 | +to a local Zarr store. |
| 5 | +
|
| 6 | + python 0_weatherbench2zarr.py |
| 7 | +
|
| 8 | +Full dataset(s) can be downloaded from |
| 9 | +https://console.cloud.google.com/storage/browser/weatherbench2 |
| 10 | +""" |
| 11 | +import os |
| 12 | + |
| 13 | +import xarray as xr |
| 14 | + |
# %%
# Temporal subset of the WeatherBench2 dataset to one year (2020). The year
# selected below must match the year encoded in the output store name.
store_name: str = "2020-full_37-6h-0p25deg-chunk-1_zuv500.zarr"
if not os.path.exists(path=store_name):
    # Open WeatherBench2 Zarr store from Google Cloud Storage
    ds: xr.Dataset = xr.open_dataset(
        filename_or_obj="gs://weatherbench2/datasets/era5/1959-2022-full_37-6h-0p25deg-chunk-1.zarr-v2",
        engine="zarr",
        chunks="auto",
        consolidated=True,
    )

    # Subset to year 2020, 500hPa pressure level. drop=True removes the
    # now-scalar `level` coordinate from the result.
    ds_500hpa: xr.Dataset = ds.sel(
        time=slice("2020-01-01", "2020-12-31"), level=500, drop=True
    )

    # Get data variables z500, u500, v500. Indexing with a list of names
    # raises KeyError straight away if a variable is missing, instead of
    # silently yielding None and failing later.
    ds_500hpa_zuv: xr.Dataset = ds_500hpa[
        ["geopotential", "u_component_of_wind", "v_component_of_wind"]
    ]

    # Disable LZ4 compression, since cupy-xarray cannot handle it yet.
    # This covers coordinate variables as well as data variables.
    for var in ds_500hpa_zuv.variables:
        ds_500hpa_zuv[var].encoding["compressor"] = None

    # Save to Zarr with chunks of size 1 along time dimension, each chunk
    # spanning the full latitude/longitude extent.
    # Can take about 1 hour to save 10.7GB of data at 40MB/s
    ds_rechunked: xr.Dataset = ds_500hpa_zuv.chunk(
        time=1,
        latitude=len(ds_500hpa_zuv.latitude),
        longitude=len(ds_500hpa_zuv.longitude),
    )
    ds_rechunked.to_zarr(store=store_name, consolidated=True, zarr_version=2)

# Read back Zarr store using kvikIO engine to
# ensure things were saved correctly and can be loaded into GPU directly
ds_zarr: xr.Dataset = xr.open_dataset(
    filename_or_obj=store_name, engine="kvikio", consolidated=False
)
print(ds_zarr)
print(ds_zarr.u_component_of_wind)
print(f"Loaded as {ds_zarr.u_component_of_wind.isel(time=0).data.__class__}")
0 commit comments