|
1 | 1 | # Copyright (c) 2025 by Brockmann Consult GmbH |
2 | 2 | # Permissions are hereby granted under the terms of the MIT License: |
3 | 3 | # https://opensource.org/licenses/MIT. |
4 | | - |
| 4 | +import re |
5 | 5 | from collections.abc import Iterable |
6 | 6 | import logging |
7 | 7 | import os |
|
10 | 10 | import shutil |
11 | 11 | import tarfile |
12 | 12 | import tempfile |
13 | | -from typing import Any |
| 13 | +from typing import Any, Mapping |
14 | 14 | import xml.etree |
15 | 15 | import zipfile |
16 | 16 |
|
@@ -73,30 +73,43 @@ def read_dataset_from_archive( |
73 | 73 | return read_dataset_from_directory(data_dirs[0]) |
74 | 74 |
|
75 | 75 |
|
def read_dataset_from_directory(data_dir: str | os.PathLike[Any]) -> "xr.Dataset":
    """Read one extracted EnMAP product directory into an xarray Dataset.

    Args:
        data_dir: path to a directory containing the product's ``*.TIF``
            data files and its metadata XML file.

    Returns:
        A dataset with one variable per entry in ``VAR_MAP`` (each TIFF
        opened via rioxarray and squeezed to drop the singleton band
        dimension), with metadata attached from the product's XML file.
    """
    data_path = pathlib.Path(data_dir)
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    LOGGER.info("Processing %s", data_path)
    arrays = {
        name: rioxarray.open_rasterio(tiff_path).squeeze()
        for name, tiff_path in find_datafiles(data_path).items()
    }
    ds = xr.Dataset(arrays)
    add_metadata(ds, data_path)
    return ds
87 | 86 |
|
88 | 87 |
|
| 88 | +def find_datafiles(data_path: pathlib.Path) -> Mapping[str, pathlib.Path]: |
| 89 | + assert data_path.is_dir() |
| 90 | + tiffs = list(data_path.glob("*.TIF")) |
| 91 | + result = {} |
| 92 | + for name, basename in VAR_MAP.items(): |
| 93 | + pattern = f"(ENMAP.*)?{basename}.TIF" |
| 94 | + matches = [tiff for tiff in tiffs if re.match(pattern, tiff.name)] |
| 95 | + assert len(matches) > 0, f"Can't find TIFF for {name}" |
| 96 | + assert len(matches) < 2, f"Too many TIFFs for {name}" |
| 97 | + result[name] = matches[0] |
| 98 | + return result |
| 99 | + |
| 100 | + |
89 | 101 | def add_metadata(ds: xr.Dataset, data_dir: pathlib.Path): |
| 102 | + metadata_paths = list(data_dir.glob("*METADATA.XML")) |
| 103 | + assert len(metadata_paths) == 1 |
| 104 | + metadata_path = metadata_paths[0] |
90 | 105 | if str(data_dir).startswith("s3://"): |
91 | 106 | import fsspec |
92 | 107 |
|
93 | 108 | fs = fsspec.filesystem("s3") |
94 | | - with fs.open(str(data_dir) + "/" + "METADATA.XML") as fh: |
| 109 | + with fs.open(metadata_path) as fh: |
95 | 110 | root = xml.etree.ElementTree.parse(fh).getroot() |
96 | 111 | else: |
97 | | - root = xml.etree.ElementTree.parse( |
98 | | - str(data_dir) + "/" + "METADATA.XML" |
99 | | - ).getroot() |
| 112 | + root = xml.etree.ElementTree.parse(metadata_path).getroot() |
100 | 113 | points = root.findall("base/spatialCoverage/boundingPolygon/point") |
101 | 114 | bounds = shapely.Polygon( |
102 | 115 | [float(p.find("longitude").text), p.find("latitude").text] |
@@ -232,6 +245,9 @@ def extract_zip( |
232 | 245 | output_data_path = final_path / input_data_dir |
233 | 246 | prefix_length = len(input_data_path.name) + 1 |
234 | 247 | os.mkdir(output_data_path) |
| 248 | + # Strip the long, redundant prefix from the filenames. Not visible anyway |
| 249 | + # via the xarray plugin, but convenient if using this function as a |
| 250 | + # standalone archive extractor. |
235 | 251 | for filepath in input_data_path.iterdir(): |
236 | 252 | os.rename(filepath, output_data_path / filepath.name[prefix_length:]) |
237 | 253 | return output_data_path |
0 commit comments