Skip to content

Commit eb1b2ab

Browse files
authored
Merge pull request #30 from quantifyearth/mwd-habitat-process
Simplify and improve performance of habitat processing
2 parents 6fc2c22 + 7df8b70 commit eb1b2ab

File tree

4 files changed

+89
-75
lines changed

4 files changed

+89
-75
lines changed

CHANGES.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
## v1.1.0 (4/11/2025)
2+
3+
### Added
4+
5+
* Implementation of point validation based on [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/).
6+
7+
### Changed
8+
9+
* Performance improvements and simplification of habitat processing.
10+
* Store more analysis data from model validation.
11+
12+
## v1.0.1 (19/10/2025)
13+
14+
### Fixed
15+
16+
* Fixed GitHub Action for publishing to PyPI.
17+
18+
## v1.0.0 (19/10/2025)
19+
20+
### Added
21+
22+
* Initial release as a standalone package.

aoh/habitat_process.py

Lines changed: 61 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
11
import argparse
2-
import math
32
import os
43
import logging
5-
import shutil
6-
import tempfile
74
from functools import partial
85
from multiprocessing import Pool, cpu_count
96
from pathlib import Path
107
from typing import Optional, Set
118

129
import numpy as np
1310
import psutil
11+
import yirgacheffe as yg
1412
from osgeo import gdal # type: ignore
15-
from yirgacheffe.layers import RasterLayer # type: ignore
1613

1714
logger = logging.getLogger(__name__)
1815
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
@@ -24,7 +21,7 @@ def _enumerate_subset(
2421
offset: int,
2522
) -> Set[int]:
2623
gdal.SetCacheMax(1 * 1024 * 1024 * 1024)
27-
with RasterLayer.layer_from_file(habitat_path) as habitat_map:
24+
with yg.read_raster(habitat_path) as habitat_map:
2825
blocksize = min(BLOCKSIZE, habitat_map.window.ysize - offset)
2926
data = habitat_map.read_array(0, offset, habitat_map.window.xsize, blocksize)
3027
values = np.unique(data)
@@ -36,7 +33,7 @@ def enumerate_terrain_types(
3633
habitat_path: Path
3734
) -> Set[int]:
3835
gdal.SetCacheMax(1 * 1024 * 1024 * 1024)
39-
with RasterLayer.layer_from_file(habitat_path) as habitat_map:
36+
with yg.read_raster(habitat_path) as habitat_map:
4037
ysize = habitat_map.window.ysize
4138
blocks = range(0, ysize, BLOCKSIZE)
4239
logger.info("Enumerating habitat classes in raster...")
@@ -51,44 +48,65 @@ def enumerate_terrain_types(
5148
pass
5249
return superset
5350

54-
def _make_single_type_map(
51+
class VsimemFile:
52+
def __init__(self, path):
53+
self.path = path
54+
55+
def __enter__(self):
56+
return self.path
57+
58+
def __exit__(self, *args):
59+
try:
60+
gdal.Unlink(self.path)
61+
except RuntimeError:
62+
pass
63+
64+
def make_single_type_map(
5565
habitat_path: Path,
5666
pixel_scale: Optional[float],
5767
target_projection: Optional[str],
5868
output_directory_path: Path,
69+
max_threads: int,
5970
habitat_value: int | float,
6071
) -> None:
6172
logger.info("Building layer for %s...", habitat_value)
6273

63-
# We could do this via yirgacheffe if it wasn't for the need to
64-
# both rescale and reproject. So we do the initial filtering
65-
# in that, but then bounce it to a temporary file for the
66-
# warping
67-
with tempfile.TemporaryDirectory() as tmpdir:
68-
with RasterLayer.layer_from_file(habitat_path) as habitat_map:
69-
logger.info("Filtering for %s...", habitat_value)
70-
calc = habitat_map == habitat_value
71-
with RasterLayer.empty_raster_layer_like(habitat_map, datatype=gdal.GDT_Byte) as filtered_map:
72-
calc.save(filtered_map)
74+
mem_stats = psutil.virtual_memory()
75+
available_mem = mem_stats.available
76+
gdal.SetCacheMax(available_mem)
77+
gdal.SetConfigOption('GDAL_NUM_THREADS', str(max_threads))
7378

74-
filename = f"lcc_{habitat_value}.tif"
75-
tempname = os.path.join(tmpdir, filename)
79+
with yg.read_raster(habitat_path) as habitat_map:
80+
logger.info("Filtering for %s...", habitat_value)
7681

77-
dataset = filtered_map._dataset # pylint: disable=W0212
82+
# We use the GDAL in memory file system for all this
83+
with VsimemFile(f"/vsimem/filtered_{habitat_value}.tif") as filter_map_path:
84+
filtered_map = habitat_map == habitat_value
85+
filtered_map.to_geotiff(filter_map_path, parallelism=max_threads)
86+
87+
with VsimemFile(f"/vsimem/warped_{habitat_value}.tif") as warped_map_path:
7888
logger.info("Projecting %s...", habitat_value)
79-
gdal.Warp(tempname, dataset, options=gdal.WarpOptions(
80-
creationOptions=['COMPRESS=LZW', 'NUM_THREADS=16'],
81-
multithread=True,
82-
dstSRS=target_projection,
83-
outputType=gdal.GDT_Float32,
84-
xRes=pixel_scale,
85-
yRes=((0.0 - pixel_scale) if pixel_scale else pixel_scale),
86-
resampleAlg="average",
87-
workingType=gdal.GDT_Float32
88-
))
89-
90-
logger.info("Saving %s...", habitat_value)
91-
shutil.move(tempname, output_directory_path / filename)
89+
gdal.Warp(
90+
warped_map_path,
91+
filter_map_path,
92+
options=gdal.WarpOptions(
93+
creationOptions=[],
94+
multithread=True,
95+
dstSRS=target_projection,
96+
outputType=gdal.GDT_Float32,
97+
xRes=pixel_scale,
98+
yRes=((0.0 - pixel_scale) if pixel_scale else pixel_scale),
99+
resampleAlg="average",
100+
warpOptions=[f'NUM_THREADS={max_threads}'],
101+
warpMemoryLimit=available_mem,
102+
workingType=gdal.GDT_Float32
103+
)
104+
)
105+
106+
logger.info("Saving %s...", habitat_value)
107+
filename = f"lcc_{habitat_value}.tif"
108+
with yg.read_raster(warped_map_path) as result:
109+
result.to_geotiff(output_directory_path / filename)
92110

93111
def habitat_process(
94112
habitat_path: Path,
@@ -99,48 +117,20 @@ def habitat_process(
99117
) -> None:
100118
os.makedirs(output_directory_path, exist_ok=True)
101119

102-
with RasterLayer.layer_from_file(habitat_path) as habitat_map:
103-
# The processing stage uses GDAL warp directly, with no chunking, so we should
104-
# take a guess at how much memory we need based on the dimensions of the base map
105-
pixels = habitat_map.window.xsize * habitat_map.window.ysize
106-
# I really tried not to write this statement and use introspection, but nothing
107-
# I tried gave a sensible answer. Normally I'd be more paranoid due to numpy bloat,
108-
# but we're calling GDALwarp and passing it filenames, so everything should be done
109-
# in the C++ world of GDAL, so I have more confidence that we won't see the usual
110-
# 4x plus memory bloat of loading raster data into the python world.
111-
match habitat_map.datatype:
112-
case gdal.GDT_Byte | gdal.GDT_Int8:
113-
pixel_size = 1
114-
case gdal.GDT_CInt16 | gdal.GDT_Int16 | gdal.GDT_UInt16:
115-
pixel_size = 2
116-
case gdal.GDT_CFloat32 | gdal.GDT_CInt32 | gdal.GDT_Float32 | gdal.GDT_Int32:
117-
pixel_size = 4
118-
case _:
119-
pixel_size = 8
120-
estimated_memory = pixel_size * pixels
121-
122-
mem_stats = psutil.virtual_memory()
123-
max_copies = math.floor((mem_stats.available * 0.8) / estimated_memory)
124-
if max_copies == 0:
125-
logger.warning("Low memory")
126-
max_copies = 1
127-
process_count = min(max_copies, process_count)
128-
logger.info("Estimating we can run %s concurrent tasks", process_count)
129-
130120
# We need to know how many terrains there are. We could get this from the crosswalk
131121
# table, but we can also work out the unique values ourselves. In practice this is
132122
# worth the effort, otherwise we generate a lot of empty maps potentially.
133123
habitats = enumerate_terrain_types(habitat_path)
134124

135-
if max_copies > 1:
136-
with Pool(processes=process_count) as pool:
137-
pool.map(
138-
partial(_make_single_type_map, habitat_path, pixel_scale, target_projection, output_directory_path),
139-
habitats
140-
)
141-
else:
142-
for habitat in habitats:
143-
_make_single_type_map(habitat_path, pixel_scale, target_projection, output_directory_path, habitat)
125+
for habitat in habitats:
126+
make_single_type_map(
127+
habitat_path,
128+
pixel_scale,
129+
target_projection,
130+
output_directory_path,
131+
process_count,
132+
habitat,
133+
)
144134

145135
def main() -> None:
146136
parser = argparse.ArgumentParser(description="Downsample habitat map to raster per terrain type.")
@@ -177,7 +167,7 @@ def main() -> None:
177167
"-j",
178168
type=int,
179169
required=False,
180-
default=round(cpu_count() / 2),
170+
default=cpu_count(),
181171
dest="processes_count",
182172
help="Optional number of concurrent threads to use."
183173
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ dependencies = [
2929
"psutil",
3030
"pyproj>=3.4,<4.0",
3131
"scikit-image>=0.20,<1.0",
32-
"yirgacheffe>=1.9.1,<2.0",
32+
"yirgacheffe>=1.10.2,<2.0",
3333
"zenodo_search",
3434
"pandas>=2.0,<3.0",
3535
"gdal[numpy]>=3.8,<3.12",

tests/test_habitat_process.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import yirgacheffe as yg
77
from osgeo import gdal # type: ignore
88

9-
from aoh.habitat_process import enumerate_terrain_types, _make_single_type_map
9+
from aoh.habitat_process import enumerate_terrain_types, make_single_type_map
1010

1111
def generate_habitat_map(
1212
output_path: Path,
@@ -47,11 +47,12 @@ def test_simple_make_single_map() -> None:
4747
generate_habitat_map(habitat_path, (20, 10), options)
4848
assert habitat_path.exists()
4949

50-
_make_single_type_map(
50+
make_single_type_map(
5151
habitat_path,
5252
None,
5353
None,
5454
tmp,
55+
1,
5556
100,
5657
)
5758
expected_result_path = tmp / "lcc_100.tif"
@@ -76,11 +77,12 @@ def test_rescale_make_single_map() -> None:
7677
generate_habitat_map(habitat_path, (20, 10), options)
7778
assert habitat_path.exists()
7879

79-
_make_single_type_map(
80+
make_single_type_map(
8081
habitat_path,
8182
180.0 / 5.0, # Scale down by half
8283
None,
8384
tmp,
85+
1,
8486
100,
8587
)
8688
expected_result_path = tmp / "lcc_100.tif"

0 commit comments

Comments
 (0)