
Commit 8a69ac3

Tighter encapsulation of "column requirements" of MultiBackendJobManager #741/#815
Using another solution to break the import cycle (and some doc/test tweaks along the way)
1 parent 308025a

File tree: 8 files changed, +89 −87 lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Internal reorganization of `openeo.extra.job_management` submodule to ease future development ([#741](https://github.com/Open-EO/openeo-python-client/issues/741))
+
 ### Removed
 
 ### Fixed
```

openeo/extra/job_management/_df_schema.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

openeo/extra/job_management/_interface.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -6,7 +6,8 @@
 
 class JobDatabaseInterface(metaclass=abc.ABCMeta):
     """
-    Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`,
+    Interface for a database of job metadata to use with the
+    :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager`,
     allowing to regularly persist the job metadata while polling the job statuses
     and resume/restart the job tracking after it was interrupted.
```

openeo/extra/job_management/_job_db.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -7,7 +7,7 @@
 import shapely.errors
 import shapely.wkt
 
-from openeo.extra.job_management._df_schema import _COLUMN_REQUIREMENTS, _normalize
+import openeo.extra.job_management._manager
 from openeo.extra.job_management._interface import JobDatabaseInterface
 
 _log = logging.getLogger(__name__)
```
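Note on the cycle-breaking approach above: a plain `import package.module` binds only the module object, which may still be partially initialized during a circular import; attribute lookups like `openeo.extra.job_management._manager.MultiBackendJobManager` are deferred until call time, when both modules are fully loaded. A minimal standalone sketch of the pattern (hypothetical modules `a`/`b`, not part of this codebase):

```python
# --- a.py ---
import b  # plain module import: succeeds even while b is only partially initialized

def alpha():
    return "alpha"

def use_b():
    return b.beta()  # attribute lookup is deferred until call time

# --- b.py ---
import a  # the reverse edge of the cycle: equally fine

def beta():
    return a.alpha()  # resolved lazily as well, at call time

# By contrast, `from a import alpha` in b.py would raise ImportError when a is
# imported first: a's import of b re-enters before `alpha` is bound on the
# half-initialized module a.
```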
```diff
@@ -22,7 +22,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         """
         Initialize the job database from a given dataframe,
         which will be first normalized to be compatible
-        with :py:class:`MultiBackendJobManager` usage.
+        with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager` usage.
 
         :param df: dataframe with some columns your ``start_job`` callable expects
         :param on_exists: what to do when the job database already exists (persisted on disk):
@@ -42,7 +42,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         else:
             # TODO handle other on_exists modes: e.g. overwrite, merge, ...
             raise ValueError(f"Invalid on_exists={on_exists!r}")
-        df = _normalize(df)
+        df = openeo.extra.job_management._manager.MultiBackendJobManager._column_requirements.normalize_df(df)
         self.persist(df)
         # Return self to allow chaining with constructor.
         return self
@@ -104,7 +104,7 @@ class CsvJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a CSV file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     :param path: Path to local CSV file.
 
     .. note::
@@ -135,7 +135,7 @@ def read(self) -> pd.DataFrame:
         df = pd.read_csv(
             self.path,
             # TODO: possible to avoid hidden coupling with MultiBackendJobManager here?
-            dtype={c: r.dtype for (c, r) in _COLUMN_REQUIREMENTS.items()},
+            dtype=openeo.extra.job_management._manager.MultiBackendJobManager._column_requirements.dtype_mapping(),
         )
         if (
             "geometry" in df.columns
@@ -159,7 +159,7 @@ class ParquetJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a Parquet file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     :param path: Path to the Parquet file.
 
     .. note::
```

openeo/extra/job_management/_manager.py

Lines changed: 71 additions & 12 deletions
```diff
@@ -1,5 +1,6 @@
 import collections
 import contextlib
+import dataclasses
 import datetime
 import json
 import logging
@@ -12,6 +13,7 @@
     Callable,
     Dict,
     List,
+    Mapping,
     NamedTuple,
     Optional,
     Tuple,
@@ -23,10 +25,10 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
 
+# TODO avoid this (circular) dependency on _job_db?
+import openeo.extra.job_management._job_db
 from openeo import BatchJob, Connection
-from openeo.extra.job_management._df_schema import _normalize
 from openeo.extra.job_management._interface import JobDatabaseInterface
-from openeo.extra.job_management._job_db import get_job_db
 from openeo.extra.job_management._thread_worker import (
     _JobManagerWorkerThreadPool,
     _JobStartTask,
@@ -38,6 +40,7 @@
 _log = logging.getLogger(__name__)
 
 
+# TODO: eliminate this module constant (should be part of some constructor interface)
 MAX_RETRIES = 50
 
 
@@ -58,6 +61,45 @@ class _Backend(NamedTuple):
     parallel_jobs: int
 
 
+@dataclasses.dataclass(frozen=True)
+class _ColumnProperties:
+    """Expected/required properties of a column in the job manager related dataframes"""
+
+    dtype: str = "object"
+    default: Any = None
+
+
+class _ColumnRequirements:
+    """
+    Helper class to encapsulate the column requirements expected by MultiBackendJobManager.
+    The current implementation (e.g. _job_db) has some undesired coupling here,
+    but it turns out quite hard to eliminate.
+    The goal of this class is, currently, to at least make the coupling explicit
+    in a centralized way.
+    """
+
+    def __init__(self, requirements: Mapping[str, _ColumnProperties]):
+        self._requirements = dict(requirements)
+
+    def normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Normalize given pandas dataframe (creating a new one):
+        ensure we have the required columns.
+
+        :param df: The dataframe to normalize.
+        :return: a new dataframe that is normalized.
+        """
+        new_columns = {col: req.default for (col, req) in self._requirements.items() if col not in df.columns}
+        df = df.assign(**new_columns)
+        return df
+
+    def dtype_mapping(self) -> Dict[str, str]:
+        """
+        Get mapping of column name to expected dtype string, e.g. to be used with pandas.read_csv(dtype=...)
+        """
+        return {col: req.dtype for (col, req) in self._requirements.items()}
+
+
 class MultiBackendJobManager:
     """
     Tracker for multiple jobs on multiple backends.
@@ -125,6 +167,27 @@ def start_job(
         Added ``cancel_running_job_after`` parameter.
     """
 
+    # Expected columns in the job DB dataframes.
+    # TODO: make this part of public API when settled?
+    # TODO: move non official statuses to separate column (not_started, queued_for_start)
+    _column_requirements: _ColumnRequirements = _ColumnRequirements(
+        {
+            "id": _ColumnProperties(dtype="str"),
+            "backend_name": _ColumnProperties(dtype="str"),
+            "status": _ColumnProperties(dtype="str", default="not_started"),
+            # TODO: use proper date/time dtype instead of legacy str for start times?
+            "start_time": _ColumnProperties(dtype="str"),
+            "running_start_time": _ColumnProperties(dtype="str"),
+            # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
+            # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
+            # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
+            "cpu": _ColumnProperties(dtype="str"),
+            "memory": _ColumnProperties(dtype="str"),
+            "duration": _ColumnProperties(dtype="str"),
+            "costs": _ColumnProperties(dtype="float64"),
+        }
+    )
+
     def __init__(
         self,
         poll_sleep: int = 60,
```
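For illustration, a small sketch of how this encapsulation is meant to be consumed (the asserted values follow directly from the requirements declared above; the input dataframe is hypothetical):

```python
import pandas as pd

from openeo.extra.job_management._manager import MultiBackendJobManager

# Hypothetical input: just the columns a `start_job` callable would consume.
df = pd.DataFrame({"year": [2021, 2022, 2023]})

# normalize_df() appends each missing required column with its declared default:
# "status" becomes "not_started", the other added columns default to None.
normalized = MultiBackendJobManager._column_requirements.normalize_df(df)
assert sorted(normalized.columns) == [
    "backend_name", "costs", "cpu", "duration", "id", "memory",
    "running_start_time", "start_time", "status", "year",
]
assert (normalized["status"] == "not_started").all()

# dtype_mapping() gives the per-column dtype strings,
# e.g. for pd.read_csv(dtype=...) as used in CsvJobDatabase.read().
assert MultiBackendJobManager._column_requirements.dtype_mapping()["costs"] == "float64"
```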
```diff
@@ -227,13 +290,9 @@ def _make_resilient(connection):
     @classmethod
     def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Normalize given pandas dataframe (creating a new one):
-        ensure we have the required columns.
-
-        :param df: The dataframe to normalize.
-        :return: a new dataframe that is normalized.
+        Deprecated, but kept for backwards compatibility
         """
-        return _normalize(df)
+        return cls._column_requirements.normalize_df(df)
 
     def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface):
         """
@@ -268,7 +327,7 @@ def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabas
         :param job_db:
             Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` interface.
 
             .. note::
                 Support for Parquet files depends on the ``pyarrow`` package
@@ -373,7 +432,7 @@ def run_jobs(
         :param job_db:
             Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` interface.
 
             .. note::
                 Support for Parquet files depends on the ``pyarrow`` package
@@ -389,7 +448,7 @@ def run_jobs(
         .. versionchanged:: 0.31.0
             Replace ``output_file`` argument with ``job_db`` argument,
             which can be a path to a CSV or Parquet file,
-            or a user-defined :py:class:`JobDatabaseInterface` object.
+            or a user-defined :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` object.
             The deprecated ``output_file`` argument is still supported for now.
 
         .. versionchanged:: 0.33.0
@@ -408,7 +467,7 @@ def run_jobs(
         assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}"
 
         if isinstance(job_db, (str, Path)):
-            job_db = get_job_db(path=job_db)  # TODO circular import
+            job_db = openeo.extra.job_management._job_db.get_job_db(path=job_db)
 
         if not isinstance(job_db, JobDatabaseInterface):
             raise ValueError(f"Unsupported job_db {job_db!r}")
```
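For context, `get_job_db` dispatches on the file extension, roughly along these lines (a simplified sketch, not the verbatim implementation; it assumes `CsvJobDatabase`/`ParquetJobDatabase` are importable from `openeo.extra.job_management` as in the tests):

```python
from pathlib import Path
from typing import Union

from openeo.extra.job_management import CsvJobDatabase, ParquetJobDatabase

def get_job_db_sketch(path: Union[str, Path]):
    # Pick a job database flavor from the file extension,
    # mirroring how run_jobs() accepts a plain str/Path as job_db.
    path = Path(path)
    if path.suffix.lower() == ".csv":
        return CsvJobDatabase(path=path)
    if path.suffix.lower() in {".parquet", ".geoparquet"}:
        return ParquetJobDatabase(path=path)
    raise ValueError(f"Could not guess job database type from {path!r}")
```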

openeo/extra/job_management/stac_job_db.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -22,7 +22,7 @@ class STACAPIJobDatabase(JobDatabaseInterface):
 
     Unstable API, subject to change.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     """
 
     def __init__(
@@ -56,10 +56,10 @@ def exists(self) -> bool:
 
     def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Normalize the given dataframe to be compatible with :py:class:`MultiBackendJobManager`
+        Normalize the given dataframe to be compatible with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager`
         by adding the default columns and setting the index.
         """
-        df = MultiBackendJobManager._normalize_df(df)
+        df = MultiBackendJobManager._column_requirements.normalize_df(df)
         # If the user doesn't specify the item_id column, we will use the index.
         if "item_id" not in df.columns:
             df = df.reset_index(names=["item_id"])
@@ -69,7 +69,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         """
         Initialize the job database from a given dataframe,
         which will be first normalized to be compatible
-        with :py:class:`MultiBackendJobManager` usage.
+        with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager` usage.
 
         :param df: dataframe with some columns your ``start_job`` callable expects
         :param on_exists: what to do when the job database already exists (persisted on disk):
```

tests/extra/job_management/test_job_db.py

Lines changed: 0 additions & 7 deletions
```diff
@@ -1,13 +1,6 @@
 import re
 
 import geopandas
-
-# TODO: can we avoid using httpretty?
-# We need it for testing the resilience, which uses an HTTPadapter with Retry
-# but requests-mock also uses an HTTPAdapter for the mocking and basically
-# erases the HTTPAdapter we have set up.
-# httpretty avoids this specific problem because it mocks at the socket level,
-# But I would rather not have two dependencies with almost the same goal.
 import pandas as pd
 import pytest
 import shapely.geometry
```

tests/extra/job_management/test_manager.py

Lines changed: 4 additions & 11 deletions
```diff
@@ -31,10 +31,7 @@
     ParquetJobDatabase,
     create_job_db,
 )
-from openeo.extra.job_management._manager import (
-    MAX_RETRIES,
-    MultiBackendJobManager,
-)
+from openeo.extra.job_management._manager import MAX_RETRIES, MultiBackendJobManager
 from openeo.extra.job_management._thread_worker import (
     Task,
     _JobManagerWorkerThreadPool,
@@ -296,7 +293,7 @@ def test_start_job_thread_basic(self, tmp_path, job_manager, job_manager_root_di
 
     def test_normalize_df(self):
         df = pd.DataFrame({"some_number": [3, 2, 1]})
-        df_normalized = MultiBackendJobManager._normalize_df(df)
+        df_normalized = MultiBackendJobManager._column_requirements.normalize_df(df)
         assert set(df_normalized.columns) == set(
             [
                 "some_number",
@@ -603,9 +600,7 @@ def get_status(job_id, current_status):
         job_db_path = tmp_path / "jobs.csv"
 
         # Mock sleep() to not actually sleep, but skip one hour at a time
-        with mock.patch.object(
-            openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)
-        ):
+        with mock.patch("time.sleep", new=lambda s: time_machine.shift(60 * 60)):
             job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
 
         final_df = CsvJobDatabase(job_db_path).read()
@@ -724,9 +719,7 @@ def get_status(job_id, current_status):
         job_db_path = tmp_path / "jobs.csv"
 
         # Mock sleep() to skip one hour at a time instead of actually sleeping
-        with mock.patch.object(
-            openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)
-        ):
+        with mock.patch("time.sleep", new=lambda s: time_machine.shift(60 * 60)):
             job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
 
         final_df = CsvJobDatabase(job_db_path).read()
```
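The simplified patch target works because the job manager calls the global `time.sleep`; combined with `time_machine`, every poll cycle then fast-forwards a fake clock instead of blocking. A self-contained sketch of that pattern (not the actual test code):

```python
import datetime as dt
import time
from unittest import mock

import time_machine

# Freeze the clock (tick=False), then make every sleep() call jump it one hour.
with time_machine.travel(dt.datetime(2024, 1, 1, tzinfo=dt.timezone.utc), tick=False) as traveller:
    with mock.patch("time.sleep", new=lambda seconds: traveller.shift(60 * 60)):
        t0 = time.time()
        time.sleep(60)  # returns immediately ...
        assert time.time() - t0 == 60 * 60  # ... but one (fake) hour has passed
```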
