diff --git a/recipes/antgg-2015/meta.yaml b/recipes/antgg-2015/meta.yaml new file mode 100644 index 0000000000..0050356f33 --- /dev/null +++ b/recipes/antgg-2015/meta.yaml @@ -0,0 +1,39 @@ +# Name for dataset. User chosen. +title: 'ANTGG-2015' +# Description of dataset. User chosen, roughly 1 sentence in length. +description: 'ANTGG dataset converted to zarr stores from a netCDF file' +# Version of pangeo_forge_recipes library that was used +pangeo_forge_version: '0.10.4' +# The recipes section tells Pangeo Cloud where to find the recipes within your PR. +# Many recipe PRs will have just 1 recipe, in which case this section will look similar to the example below. +# If your PR contains multiple recipes, you may add additional elements to the list below. +recipes: + # User chosen name for recipe. Likely similar to dataset name, ~25 characters in length + - id: antgg-2015 + # The `object` below tells Pangeo Cloud specifically where your recipe instance(s) are located and uses the format <filename>:<object_name> + # <filename> is name of .py file where the Python recipe object is defined. + # For example, if <filename> is given as "recipe", Pangeo Cloud will expect a file named `recipe.py` to exist in your PR. + # <object_name> is the name of the recipe object (i.e. Python class instance) _within_ the specified file. + # For example, if you have defined `recipe = XarrayZarrRecipe(...)` within a file named `recipe.py`, then your `object` below would be `"recipe:recipe"` + object: recipe:transforms +provenance: + # Data provider object. Follow STAC spec. + # https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#provider-object + providers: + - name: 'PANGAEA' + description: 'Data Publisher for Earth & Environmental Science' + roles: + - licensor + url: https://doi.pangaea.de/10.1594/PANGAEA.848168 + # This is a required field for provider. 
Follow STAC spec
+  # https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#license
+  license: 'CC-BY-3.0'
+maintainers:
+  # Information about recipe creator. name and github are required
+  - name: 'Matthew Tankersley'
+    orcid: '0000-0003-4266-8554'
+    github: mdtanker
+# The specific bakery (i.e. cloud infrastructure) that your recipe will run on.
+# Available bakeries can be found on the Pangeo Forge website https://pangeo-forge.org/dashboard/bakeries
+# bakery:
+#   id: 'pangeo-ldeo-nsf-earthcube'
diff --git a/recipes/antgg-2015/recipe.py b/recipes/antgg-2015/recipe.py
new file mode 100644
index 0000000000..5f6bdd62db
--- /dev/null
+++ b/recipes/antgg-2015/recipe.py
@@ -0,0 +1,77 @@
+import apache_beam as beam
+import zarr
+
+from pangeo_forge_recipes.patterns import FilePattern
+from pangeo_forge_recipes.transforms import (
+    Indexed,
+    OpenURLWithFSSpec,
+    OpenWithXarray,
+    StoreToZarr,
+    T,
+)
+
+
+def make_url():
+    """Return the URL of the single ANTGG-2015 netCDF source file."""
+    return 'https://hs.pangaea.de/Maps/antgg2015/antgg2015.nc'
+
+
+pattern = FilePattern(make_url)
+
+
+def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore:
+    """Check that the written Zarr store holds every expected data variable.
+
+    Opens ``store`` with xarray and asserts that each expected variable name
+    is present, then returns ``store`` unchanged so the declared return type
+    holds and the element stays available to any later pipeline stage.
+    """
+    # This fails integration test if not imported here
+    import xarray as xr
+
+    ds = xr.open_dataset(store, engine='zarr', consolidated=True, chunks={})
+    for var in [
+        'ellipsoidal_height',
+        'orthometric_height',
+        'free_air_anomaly',
+        'accuracy_measure',
+        'bouguer_anomaly',
+    ]:
+        assert var in ds.data_vars
+
+    return store
+
+
+class Preprocess(beam.PTransform):
+    """Preprocessor transform."""
+
+    @staticmethod
+    def _preproc(item: Indexed[T]) -> Indexed[T]:
+        """Scale x and y by 1000 and drop the longitude, latitude and crs variables."""
+        index, ds = item
+        # NOTE(review): presumably converts km to m -- confirm against the source file
+        ds['x'] = ds.x * 1000
+        ds['y'] = ds.y * 1000
+        # `Dataset.drop` is deprecated for removing variables; `drop_vars` replaces it
+        ds = ds.drop_vars(['longitude', 'latitude', 'crs'])
+        return index, ds
+
+    def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
+        """Apply ``_preproc`` to every element of ``pcoll``."""
+        return pcoll | beam.Map(self._preproc)
+
+
+transforms = (
+    beam.Create(pattern.items())
+    | OpenURLWithFSSpec()
+    | OpenWithXarray(
+        file_type=pattern.file_type,
+    )
+    | Preprocess()
+    | StoreToZarr(
+        store_name='antgg2015.zarr',
+        # StoreToZarr takes the pattern's combine dimensions (see its API docs)
+        combine_dims=pattern.combine_dims,
+    )
+    | 'Test dataset' >> beam.Map(test_ds)
+)