
Commit 8b19ef7

Fix problem with zipped sub-archive extraction
This commit fixes a bug whereby trying to open a single-product sub-archive would open the first sub-archive in the parent directory (which may or may not be the requested sub-archive!).
1 parent 5165b91 commit 8b19ef7
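
To make the failure mode concrete: before this fix, a single-product `.ZIP` was resolved by scanning its parent directory and taking the first sub-archive found there. The following minimal, self-contained sketch (invented file names, not code from this repository) reproduces that behaviour:

```python
# Minimal sketch of the bug described above (invented names, not repo code):
# resolving a requested .ZIP by scanning its parent directory picks whichever
# sub-archive is found first, not necessarily the one that was asked for.
import os
import pathlib
import tempfile
import zipfile

with tempfile.TemporaryDirectory() as tmp:
    parent = pathlib.Path(tmp)
    # Two sibling sub-archives, as left behind by unpacking an EnMAP .tar.gz
    for name in ["PRODUCT-A.ZIP", "PRODUCT-B.ZIP"]:
        with zipfile.ZipFile(parent / name, "w") as zf:
            zf.writestr("dummy.txt", name)

    requested = parent / "PRODUCT-B.ZIP"
    # Old-style lookup: take the first .ZIP found in the parent directory
    first_found = next(
        parent / f for f in sorted(os.listdir(parent)) if f.endswith(".ZIP")
    )
    print(first_found.name)  # PRODUCT-A.ZIP -- not the requested sub-archive
    print(requested.name)    # PRODUCT-B.ZIP -- what the fixed code now extracts
```

After this commit, the requested `.ZIP` is handed directly to the new `extract_zip` helper rather than being re-discovered via `find_zips` on its parent directory.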

2 files changed (+53 lines, -33 lines)

README.md

Lines changed: 10 additions & 4 deletions
@@ -50,8 +50,14 @@ enmap_dataset = xr.open_dataset(
 )
 ```
 
-The path can be to either a `.tar.gz` archive as provided by the EnMAP portal,
-or to a directory containing the extracted archive contents.
+The supplied path can reference:
 
-If the archive or directory contains multiple EnMAP products, xarray-enmap
-will open only the first. This will be improved in a future version.
+- a `.tar.gz` archive as provided by the EnMAP portal, containing one or
+  more EnMAP products in `.ZIP` sub-archives, or
+- a `.ZIP` archive containing a single product, as found within an EnMAP
+  `.tar.gz` archive, or
+- a directory containing the unpacked contents of either of the aforementioned
+  archive types.
+
+At present, if the archive or directory contains multiple EnMAP products,
+xarray-enmap will open only the first.
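
For illustration, all three inputs described above would be opened the same way (the paths and the `engine="enmap"` name below are assumptions for this sketch, not taken from this excerpt):

```python
import xarray as xr

# Hypothetical paths; the engine name "enmap" is an assumption for this sketch.
for path in [
    "downloads/ENMAP_order.tar.gz",        # portal .tar.gz with one or more products
    "downloads/ENMAP_single_product.ZIP",  # a single .ZIP sub-archive from the .tar.gz
    "downloads/ENMAP_unpacked/",           # directory holding already-extracted contents
]:
    ds = xr.open_dataset(path, engine="enmap")
```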

xarray_enmap/xarray_enmap.py

Lines changed: 43 additions & 29 deletions
@@ -53,7 +53,9 @@ def open_dataset(
         elif filename_or_obj.startswith("s3://"):
             ds = read_dataset_from_directory(filename_or_obj)
         else:
-            raise ValueError(f"{filename_or_obj} is neither a path nor a directory.")
+            raise ValueError(
+                f"{filename_or_obj} is neither a path nor a directory."
+            )
         ds.set_close(self.close)
         return ds
 
@@ -74,7 +76,9 @@ def read_dataset_from_archive(
 def read_dataset_from_directory(data_dir):
     LOGGER.info(f"Processing {data_dir}")
     arrays = {
-        name: rioxarray.open_rasterio(str(data_dir) + "/" + (filename + ".TIF")).squeeze()
+        name: rioxarray.open_rasterio(
+            str(data_dir) + "/" + (filename + ".TIF")
+        ).squeeze()
         for name, filename in VAR_MAP.items()
     }
     ds = xr.Dataset(arrays)
@@ -85,11 +89,14 @@ def read_dataset_from_directory(data_dir):
 def add_metadata(ds: xr.Dataset, data_dir: pathlib.Path):
     if str(data_dir).startswith("s3://"):
         import fsspec
+
         fs = fsspec.filesystem("s3")
         with fs.open(str(data_dir) + "/" + "METADATA.XML") as fh:
             root = xml.etree.ElementTree.parse(fh).getroot()
     else:
-        root = xml.etree.ElementTree.parse(str(data_dir) + "/" + "METADATA.XML").getroot()
+        root = xml.etree.ElementTree.parse(
+            str(data_dir) + "/" + "METADATA.XML"
+        ).getroot()
     points = root.findall("base/spatialCoverage/boundingPolygon/point")
     bounds = shapely.Polygon(
         [float(p.find("longitude").text), p.find("latitude").text]
@@ -179,40 +186,27 @@ def extract_archives(
     archive_path: os.PathLike | str, dest_dir: os.PathLike | str
 ) -> Iterable[pathlib.Path]:
     dest_path = pathlib.Path(dest_dir)
+    inner_path = dest_path / "inner-archive"
+    final_path = dest_path / "data"
+    os.mkdir(final_path)
     archive_path = pathlib.Path(archive_path)
     if archive_path.name.endswith(".tar.gz"):
-        # An EnMAP tgz usually contains one or more zip archives
-        # containing the actual data files.
+        # An EnMAP tgz usually contains one or more zip archives containing
+        # the actual data files.
         outer_path = dest_path / "outer-archive"
         LOGGER.info(f"Extracting {archive_path.name}")
         with tarfile.open(archive_path) as tgz_file:
             tgz_file.extractall(path=outer_path, filter="data")
+        data_paths = []
+        for index, path_to_zip_file in enumerate(find_zips(outer_path)):
+            data_paths.append(
+                extract_zip(final_path, index, inner_path, path_to_zip_file)
+            )
+        return data_paths
     else:
-        # Assume it's a zip and skip the outer archive
-        # extraction step.
+        # Assume it's a zip and skip the outer archive extraction step.
         LOGGER.info(f"Assuming {archive_path} is an inner zipfile")
-        outer_path = archive_path.parent
-    inner_path = dest_path / "inner-archive"
-
-    data_paths = []
-    final_path = dest_path / "data"
-    os.mkdir(final_path)
-    for index, path_to_zip_file in enumerate(find_zips(outer_path)):
-        LOGGER.info(f"Extracting {path_to_zip_file.name}")
-        extract_path = inner_path / str(index)
-        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
-            zip_ref.extractall(extract_path)
-        input_data_path = list(extract_path.iterdir())[0]
-        input_data_dir = input_data_path.name
-        output_data_path = final_path / input_data_dir
-        data_paths.append(output_data_path)
-        prefix_length = len(input_data_path.name) + 1
-        os.mkdir(output_data_path)
-        for filepath in input_data_path.iterdir():
-            os.rename(
-                filepath, output_data_path / filepath.name[prefix_length:]
-            )
-    return data_paths
+        return [(extract_zip(final_path, 0, inner_path, archive_path))]
 
 
 def find_zips(root: os.PathLike):
@@ -221,3 +215,23 @@ def find_zips(root: os.PathLike):
         for filename in files:
             if filename.endswith(".ZIP"):
                 yield pathlib.Path(parent, filename)
+
+
+def extract_zip(
+    final_path: pathlib.Path,
+    index: int,
+    inner_path: pathlib.Path,
+    path_to_zip_file: pathlib.Path,
+) -> pathlib.Path:
+    LOGGER.info(f"Extracting {path_to_zip_file.name}")
+    extract_path = inner_path / str(index)
+    with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
+        zip_ref.extractall(extract_path)
+    input_data_path = list(extract_path.iterdir())[0]
+    input_data_dir = input_data_path.name
+    output_data_path = final_path / input_data_dir
+    prefix_length = len(input_data_path.name) + 1
+    os.mkdir(output_data_path)
+    for filepath in input_data_path.iterdir():
+        os.rename(filepath, output_data_path / filepath.name[prefix_length:])
+    return output_data_path
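
As a side note, the renaming step in the new `extract_zip` helper strips the product-directory prefix (plus one separator character) from each file name. A small standalone sketch with invented names shows the resulting layout under `data/`:

```python
# Illustrative only: mirrors the prefix-stripping in extract_zip using
# invented product and file names.
import pathlib

input_data_dir = "ENMAP_PRODUCT_X"                # hypothetical product folder name
member_files = [
    "ENMAP_PRODUCT_X-SPECTRAL_IMAGE.TIF",         # hypothetical archive members
    "ENMAP_PRODUCT_X-METADATA.XML",
]
prefix_length = len(input_data_dir) + 1           # folder name plus one separator

for name in member_files:
    target = pathlib.Path("data", input_data_dir, name[prefix_length:])
    print(f"{name} -> {target}")
# ENMAP_PRODUCT_X-SPECTRAL_IMAGE.TIF -> data/ENMAP_PRODUCT_X/SPECTRAL_IMAGE.TIF
# ENMAP_PRODUCT_X-METADATA.XML -> data/ENMAP_PRODUCT_X/METADATA.XML
```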
