Skip to content

Commit 97e7b25

Browse files
authored
Merge pull request #5 from bcdev/pont-3-directories
Handle unzipped directories as input
2 parents a094ddb + f56ca37 commit 97e7b25

File tree

5 files changed

+67
-26
lines changed

5 files changed

+67
-26
lines changed

CHANGES.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## Changes in 0.0.2 (in development)
2+
3+
- Handle unpacked data directories (#3)
4+
15
## Changes in 0.0.1
26

37
Initial release

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ channels:
33
- conda-forge
44
dependencies:
55
- python >=3.11
6+
- fsspec
67
- rioxarray
78
- shapely
89
- xarray

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ license = "MIT"
2121
license-files = ["LICEN[CS]E*"]
2222
requires-python = ">=3.11"
2323
dependencies = [
24+
"fsspec",
2425
"rioxarray",
2526
"shapely",
2627
"xarray",

xarray_enmap/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.1"
1+
__version__ = "0.0.2.dev0"

xarray_enmap/xarray_enmap.py

Lines changed: 60 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Copyright (c) 2025 by Brockmann Consult GmbH
22
# Permissions are hereby granted under the terms of the MIT License:
33
# https://opensource.org/licenses/MIT.
4-
4+
import re
55
from collections.abc import Iterable
66
import logging
77
import os
@@ -10,7 +10,7 @@
1010
import shutil
1111
import tarfile
1212
import tempfile
13-
from typing import Any
13+
from typing import Any, Mapping
1414
import xml.etree
1515
import zipfile
1616

@@ -49,9 +49,9 @@ def open_dataset(
4949
if path.is_file():
5050
ds = read_dataset_from_archive(filename_or_obj, self.temp_dir)
5151
elif path.is_dir():
52-
ds = read_dataset_from_directory(path)
52+
ds = read_dataset_from_unknown_directory(path, self.temp_dir)
5353
elif filename_or_obj.startswith("s3://"):
54-
ds = read_dataset_from_directory(filename_or_obj)
54+
ds = read_dataset_from_inner_directory(filename_or_obj)
5555
else:
5656
raise ValueError(
5757
f"{filename_or_obj} is neither a path nor a directory."
@@ -65,38 +65,67 @@ def close(self):
6565

6666

6767
def read_dataset_from_archive(
68-
input_filename: str, temp_dir: str
68+
input_filename: str | os.PathLike[Any], temp_dir: str
6969
) -> xr.Dataset:
7070
data_dirs = list(extract_archives(input_filename, temp_dir))
7171
if len(data_dirs) > 1:
7272
LOGGER.warning("Multiple data archives found; reading the first.")
73-
return read_dataset_from_directory(data_dirs[0])
73+
return read_dataset_from_inner_directory(data_dirs[0])
74+
75+
76+
def read_dataset_from_unknown_directory(
77+
data_dir: str | os.PathLike[Any], temp_dir: str
78+
):
79+
data_path = pathlib.Path(data_dir)
80+
metadata_files = list(data_path.glob("*METADATA.XML"))
81+
match len(metadata_files):
82+
case 0:
83+
# assume outer directory
84+
return read_dataset_from_archive(data_path, temp_dir)
85+
case 1:
86+
# assume inner directory
87+
return read_dataset_from_inner_directory(data_path)
88+
case _:
89+
raise RuntimeError("Too many METADATA.XML files")
7490

7591

76-
def read_dataset_from_directory(data_dir):
77-
LOGGER.info(f"Processing {data_dir}")
92+
def read_dataset_from_inner_directory(data_dir: str | os.PathLike[Any]):
93+
data_path = pathlib.Path(data_dir)
94+
LOGGER.info(f"Processing {data_path}")
7895
arrays = {
79-
name: rioxarray.open_rasterio(
80-
str(data_dir) + "/" + (filename + ".TIF")
81-
).squeeze()
82-
for name, filename in VAR_MAP.items()
96+
name: rioxarray.open_rasterio(filename).squeeze()
97+
for name, filename in find_datafiles(data_path).items()
8398
}
8499
ds = xr.Dataset(arrays)
85-
add_metadata(ds, data_dir)
100+
add_metadata(ds, data_path)
86101
return ds
87102

88103

104+
def find_datafiles(data_path: pathlib.Path) -> Mapping[str, pathlib.Path]:
105+
assert data_path.is_dir()
106+
tiffs = list(data_path.glob("*.TIF"))
107+
result = {}
108+
for name, basename in VAR_MAP.items():
109+
pattern = f"(ENMAP.*)?{basename}.TIF"
110+
matches = [tiff for tiff in tiffs if re.match(pattern, tiff.name)]
111+
assert len(matches) > 0, f"Can't find TIFF for {name}"
112+
assert len(matches) < 2, f"Too many TIFFs for {name}"
113+
result[name] = matches[0]
114+
return result
115+
116+
89117
def add_metadata(ds: xr.Dataset, data_dir: pathlib.Path):
118+
metadata_paths = list(data_dir.glob("*METADATA.XML"))
119+
assert len(metadata_paths) == 1
120+
metadata_path = metadata_paths[0]
90121
if str(data_dir).startswith("s3://"):
91122
import fsspec
92123

93124
fs = fsspec.filesystem("s3")
94-
with fs.open(str(data_dir) + "/" + "METADATA.XML") as fh:
125+
with fs.open(metadata_path) as fh:
95126
root = xml.etree.ElementTree.parse(fh).getroot()
96127
else:
97-
root = xml.etree.ElementTree.parse(
98-
str(data_dir) + "/" + "METADATA.XML"
99-
).getroot()
128+
root = xml.etree.ElementTree.parse(metadata_path).getroot()
100129
points = root.findall("base/spatialCoverage/boundingPolygon/point")
101130
bounds = shapely.Polygon(
102131
[float(p.find("longitude").text), p.find("latitude").text]
@@ -190,13 +219,16 @@ def extract_archives(
190219
final_path = dest_path / "data"
191220
os.mkdir(final_path)
192221
archive_path = pathlib.Path(archive_path)
193-
if archive_path.name.endswith(".tar.gz"):
194-
# An EnMAP tgz usually contains one or more zip archives containing
195-
# the actual data files.
196-
outer_path = dest_path / "outer-archive"
197-
LOGGER.info(f"Extracting {archive_path.name}")
198-
with tarfile.open(archive_path) as tgz_file:
199-
tgz_file.extractall(path=outer_path, filter="data")
222+
if archive_path.name.endswith(".tar.gz") or archive_path.is_dir():
223+
if archive_path.is_dir():
224+
outer_path = archive_path
225+
else:
226+
# An EnMAP tgz usually contains one or more zip archives containing
227+
# the actual data files.
228+
outer_path = dest_path / "outer-archive"
229+
LOGGER.info(f"Extracting {archive_path.name}")
230+
with tarfile.open(archive_path) as tgz_file:
231+
tgz_file.extractall(path=outer_path, filter="data")
200232
data_paths = []
201233
for index, path_to_zip_file in enumerate(find_zips(outer_path)):
202234
data_paths.append(
@@ -206,7 +238,7 @@ def extract_archives(
206238
else:
207239
# Assume it's a zip and skip the outer archive extraction step.
208240
LOGGER.info(f"Assuming {archive_path} is an inner zipfile")
209-
return [(extract_zip(final_path, 0, inner_path, archive_path))]
241+
return [extract_zip(final_path, 0, inner_path, archive_path)]
210242

211243

212244
def find_zips(root: os.PathLike):
@@ -232,6 +264,9 @@ def extract_zip(
232264
output_data_path = final_path / input_data_dir
233265
prefix_length = len(input_data_path.name) + 1
234266
os.mkdir(output_data_path)
267+
# Strip the long, redundant prefix from the filenames. Not visible anyway
268+
# via the xarray plugin, but convenient if using this function as a
269+
# standalone archive extractor.
235270
for filepath in input_data_path.iterdir():
236271
os.rename(filepath, output_data_path / filepath.name[prefix_length:])
237272
return output_data_path

0 commit comments

Comments (0)