# Copyright (c) 2025 by Brockmann Consult GmbH
# Permissions are hereby granted under the terms of the MIT License:
# https://opensource.org/licenses/MIT.
4-
4+ import re
55from collections .abc import Iterable
66import logging
77import os
1010import shutil
1111import tarfile
1212import tempfile
13- from typing import Any
13+ from typing import Any , Mapping
1414import xml .etree
1515import zipfile
1616
@@ -49,9 +49,9 @@ def open_dataset(
4949 if path .is_file ():
5050 ds = read_dataset_from_archive (filename_or_obj , self .temp_dir )
5151 elif path .is_dir ():
52- ds = read_dataset_from_directory (path )
52+ ds = read_dataset_from_unknown_directory (path , self . temp_dir )
5353 elif filename_or_obj .startswith ("s3://" ):
54- ds = read_dataset_from_directory (filename_or_obj )
54+ ds = read_dataset_from_inner_directory (filename_or_obj )
5555 else :
5656 raise ValueError (
5757 f"{ filename_or_obj } is neither a path nor a directory."
@@ -65,38 +65,67 @@ def close(self):
6565
6666
def read_dataset_from_archive(
    input_filename: str | os.PathLike[Any], temp_dir: str
) -> xr.Dataset:
    """Extract an EnMAP archive and read a dataset from its contents.

    Args:
        input_filename: path to an EnMAP archive (outer .tar.gz or
            inner .zip)
        temp_dir: working directory into which archives are extracted

    Returns:
        The dataset read from the first extracted data directory. If the
        archive yields more than one data directory, a warning is logged
        and only the first is read.

    Raises:
        ValueError: if no data directory could be extracted from the
            archive.
    """
    data_dirs = list(extract_archives(input_filename, temp_dir))
    if not data_dirs:
        # Previously this fell through to an opaque IndexError.
        raise ValueError(f"No data archives found in {input_filename}")
    if len(data_dirs) > 1:
        LOGGER.warning("Multiple data archives found; reading the first.")
    return read_dataset_from_inner_directory(data_dirs[0])
74+
75+
def read_dataset_from_unknown_directory(
    data_dir: str | os.PathLike[Any], temp_dir: str
):
    """Read a dataset from a directory of initially unknown layout.

    The directory may be an "outer" directory holding archives, or an
    "inner" data directory holding the product files directly. The two
    cases are distinguished by counting ``*METADATA.XML`` files: none
    means outer, exactly one means inner.

    Raises:
        RuntimeError: if more than one metadata file is present.
    """
    directory = pathlib.Path(data_dir)
    n_metadata = len(list(directory.glob("*METADATA.XML")))
    if n_metadata == 0:
        # No metadata file here, so assume archives still to be extracted.
        return read_dataset_from_archive(directory, temp_dir)
    if n_metadata == 1:
        # A single metadata file marks this as an inner data directory.
        return read_dataset_from_inner_directory(directory)
    raise RuntimeError("Too many METADATA.XML files")
7490
7591
def read_dataset_from_inner_directory(data_dir: str | os.PathLike[Any]):
    """Read an EnMAP dataset from an inner data directory.

    Each variable's TIFF (located via ``find_datafiles``) is opened with
    rioxarray, the resulting arrays are assembled into a Dataset, and
    metadata from the directory's METADATA.XML is attached.
    """
    data_path = pathlib.Path(data_dir)
    LOGGER.info(f"Processing {data_path}")
    arrays = {}
    for var_name, tiff_path in find_datafiles(data_path).items():
        # squeeze() drops any length-1 dimensions from the opened raster.
        arrays[var_name] = rioxarray.open_rasterio(tiff_path).squeeze()
    ds = xr.Dataset(arrays)
    add_metadata(ds, data_path)
    return ds
87102
88103
104+ def find_datafiles (data_path : pathlib .Path ) -> Mapping [str , pathlib .Path ]:
105+ assert data_path .is_dir ()
106+ tiffs = list (data_path .glob ("*.TIF" ))
107+ result = {}
108+ for name , basename in VAR_MAP .items ():
109+ pattern = f"(ENMAP.*)?{ basename } .TIF"
110+ matches = [tiff for tiff in tiffs if re .match (pattern , tiff .name )]
111+ assert len (matches ) > 0 , f"Can't find TIFF for { name } "
112+ assert len (matches ) < 2 , f"Too many TIFFs for { name } "
113+ result [name ] = matches [0 ]
114+ return result
115+
116+
89117def add_metadata (ds : xr .Dataset , data_dir : pathlib .Path ):
118+ metadata_paths = list (data_dir .glob ("*METADATA.XML" ))
119+ assert len (metadata_paths ) == 1
120+ metadata_path = metadata_paths [0 ]
90121 if str (data_dir ).startswith ("s3://" ):
91122 import fsspec
92123
93124 fs = fsspec .filesystem ("s3" )
94- with fs .open (str ( data_dir ) + "/" + "METADATA.XML" ) as fh :
125+ with fs .open (metadata_path ) as fh :
95126 root = xml .etree .ElementTree .parse (fh ).getroot ()
96127 else :
97- root = xml .etree .ElementTree .parse (
98- str (data_dir ) + "/" + "METADATA.XML"
99- ).getroot ()
128+ root = xml .etree .ElementTree .parse (metadata_path ).getroot ()
100129 points = root .findall ("base/spatialCoverage/boundingPolygon/point" )
101130 bounds = shapely .Polygon (
102131 [float (p .find ("longitude" ).text ), p .find ("latitude" ).text ]
@@ -190,13 +219,16 @@ def extract_archives(
190219 final_path = dest_path / "data"
191220 os .mkdir (final_path )
192221 archive_path = pathlib .Path (archive_path )
193- if archive_path .name .endswith (".tar.gz" ):
194- # An EnMAP tgz usually contains one or more zip archives containing
195- # the actual data files.
196- outer_path = dest_path / "outer-archive"
197- LOGGER .info (f"Extracting { archive_path .name } " )
198- with tarfile .open (archive_path ) as tgz_file :
199- tgz_file .extractall (path = outer_path , filter = "data" )
222+ if archive_path .name .endswith (".tar.gz" ) or archive_path .is_dir ():
223+ if archive_path .is_dir ():
224+ outer_path = archive_path
225+ else :
226+ # An EnMAP tgz usually contains one or more zip archives containing
227+ # the actual data files.
228+ outer_path = dest_path / "outer-archive"
229+ LOGGER .info (f"Extracting { archive_path .name } " )
230+ with tarfile .open (archive_path ) as tgz_file :
231+ tgz_file .extractall (path = outer_path , filter = "data" )
200232 data_paths = []
201233 for index , path_to_zip_file in enumerate (find_zips (outer_path )):
202234 data_paths .append (
@@ -206,7 +238,7 @@ def extract_archives(
206238 else :
207239 # Assume it's a zip and skip the outer archive extraction step.
208240 LOGGER .info (f"Assuming { archive_path } is an inner zipfile" )
209- return [( extract_zip (final_path , 0 , inner_path , archive_path ) )]
241+ return [extract_zip (final_path , 0 , inner_path , archive_path )]
210242
211243
212244def find_zips (root : os .PathLike ):
@@ -232,6 +264,9 @@ def extract_zip(
232264 output_data_path = final_path / input_data_dir
233265 prefix_length = len (input_data_path .name ) + 1
234266 os .mkdir (output_data_path )
267+ # Strip the long, redundant prefix from the filenames. Not visible anyway
268+ # via the xarray plugin, but convenient if using this function as a
269+ # standalone archive extractor.
235270 for filepath in input_data_path .iterdir ():
236271 os .rename (filepath , output_data_path / filepath .name [prefix_length :])
237272 return output_data_path
0 commit comments