From 733e77a7c1345c3ef5223040ecc3241edda0fc16 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Thu, 2 Oct 2025 15:15:16 +0200 Subject: [PATCH 01/10] moving code --- openeo/extra/job_management/__init__.py | 271 +----------------- .../extra/job_management/dataframe_job_db.py | 267 +++++++++++++++++ .../process_based_job_creator.py | 198 +++++++++++++ 3 files changed, 471 insertions(+), 265 deletions(-) create mode 100644 openeo/extra/job_management/dataframe_job_db.py create mode 100644 openeo/extra/job_management/process_based_job_creator.py diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 9dfef353c..61ada5db8 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -1,4 +1,3 @@ -import abc import collections import contextlib import dataclasses @@ -13,7 +12,6 @@ Any, Callable, Dict, - Iterable, List, Mapping, NamedTuple, @@ -24,9 +22,6 @@ import pandas as pd import requests -import shapely.errors -import shapely.geometry.base -import shapely.wkt from requests.adapters import HTTPAdapter from urllib3.util import Retry @@ -35,24 +30,22 @@ _JobManagerWorkerThreadPool, _JobStartTask, ) -from openeo.extra.job_management.process_based import ProcessBasedJobCreator +from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator +from openeo.extra.job_management.dataframe_job_db import JobDatabaseInterface, FullDataFrameJobDatabase, ParquetJobDatabase, CsvJobDatabase + from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth -from openeo.util import deep_get, rfc3339 +from openeo.util import deep_get, rfc3339 + +_log = logging.getLogger(__name__) __all__ = [ "JobDatabaseInterface", "FullDataFrameJobDatabase", "ParquetJobDatabase", "CsvJobDatabase", - "ProcessBasedJobCreator", - "create_job_db", - "get_job_db", ] -_log = logging.getLogger(__name__) - - class _Backend(NamedTuple): """Container for backend info/settings""" @@ -68,67 +61,6 @@ class _Backend(NamedTuple): _UNSET = object() -class JobDatabaseInterface(metaclass=abc.ABCMeta): - """ - Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, - allowing to regularly persist the job metadata while polling the job statuses - and resume/restart the job tracking after it was interrupted. - - .. versionadded:: 0.31.0 - """ - - @abc.abstractmethod - def exists(self) -> bool: - """Does the job database already exist, to read job data from?""" - ... - - @abc.abstractmethod - def persist(self, df: pd.DataFrame): - """ - Store (now or updated) job data to the database. - - The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, - so it should be merged with the existing data (if any) instead of overwriting it completely. - - :param df: job data to store. - """ - ... - - @abc.abstractmethod - def count_by_status(self, statuses: Iterable[str] = ()) -> dict: - """ - Retrieve the number of jobs per status. - - :param statuses: List/set of statuses to include. If empty, all statuses are included. - - :return: dictionary with status as key and the count as value. - """ - ... - - @abc.abstractmethod - def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: - """ - Returns a dataframe with jobs, filtered by status. - - :param statuses: List of statuses to include. - :param max: Maximum number of jobs to return. - - :return: DataFrame with jobs filtered by status. - """ - ... 
- - @abc.abstractmethod - def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: - """ - Returns a dataframe with jobs based on their (dataframe) index - - :param indices: List of indices to include. - - :return: DataFrame with jobs filtered by indices. - """ - ... - - def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): raise NotImplementedError("No 'start_job' callable provided") @@ -930,197 +862,6 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): time.sleep(sleep) -class FullDataFrameJobDatabase(JobDatabaseInterface): - def __init__(self): - super().__init__() - self._df = None - - def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): - """ - Initialize the job database from a given dataframe, - which will be first normalized to be compatible - with :py:class:`MultiBackendJobManager` usage. - - :param df: dataframe with some columns your ``start_job`` callable expects - :param on_exists: what to do when the job database already exists (persisted on disk): - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - - :return: initialized job database. - - .. versionadded:: 0.33.0 - """ - # TODO: option to provide custom MultiBackendJobManager subclass with custom normalize? - if self.exists(): - if on_exists == "skip": - return self - elif on_exists == "error": - raise FileExistsError(f"Job database {self!r} already exists.") - else: - # TODO handle other on_exists modes: e.g. overwrite, merge, ... - raise ValueError(f"Invalid on_exists={on_exists!r}") - df = MultiBackendJobManager._normalize_df(df) - self.persist(df) - # Return self to allow chaining with constructor. - return self - - @abc.abstractmethod - def read(self) -> pd.DataFrame: - """ - Read job data from the database as pandas DataFrame. - - :return: loaded job data. - """ - ... - - @property - def df(self) -> pd.DataFrame: - if self._df is None: - self._df = self.read() - return self._df - - def count_by_status(self, statuses: Iterable[str] = ()) -> dict: - status_histogram = self.df.groupby("status").size().to_dict() - statuses = set(statuses) - if statuses: - status_histogram = {k: v for k, v in status_histogram.items() if k in statuses} - return status_histogram - - def get_by_status(self, statuses, max=None) -> pd.DataFrame: - """ - Returns a dataframe with jobs, filtered by status. - - :param statuses: List of statuses to include. - :param max: Maximum number of jobs to return. - - :return: DataFrame with jobs filtered by status. - """ - df = self.df - filtered = df[df.status.isin(statuses)] - return filtered.head(max) if max is not None else filtered - - def _merge_into_df(self, df: pd.DataFrame): - if self._df is not None: - unknown_indices = set(df.index).difference(df.index) - if unknown_indices: - _log.warning(f"Merging DataFrame with {unknown_indices=} which will be lost.") - self._df.update(df, overwrite=True) - else: - self._df = df - - def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: - indices = set(indices) - known = indices.intersection(self.df.index) - unknown = indices.difference(self.df.index) - if unknown: - _log.warning(f"Ignoring unknown DataFrame indices {unknown}") - return self._df.loc[list(known)] - - -class CsvJobDatabase(FullDataFrameJobDatabase): - """ - Persist/load job metadata with a CSV file. - - :implements: :py:class:`JobDatabaseInterface` - :param path: Path to local CSV file. - - .. 
note:: - Support for GeoPandas dataframes depends on the ``geopandas`` package - as :ref:`optional dependency `. - - .. versionadded:: 0.31.0 - """ - - def __init__(self, path: Union[str, Path]): - super().__init__() - self.path = Path(path) - - def __repr__(self): - return f"{self.__class__.__name__}({str(self.path)!r})" - - def exists(self) -> bool: - return self.path.exists() - - def _is_valid_wkt(self, wkt: str) -> bool: - try: - shapely.wkt.loads(wkt) - return True - except shapely.errors.WKTReadingError: - return False - - def read(self) -> pd.DataFrame: - df = pd.read_csv( - self.path, - # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, - ) - if ( - "geometry" in df.columns - and df["geometry"].dtype.name != "geometry" - and self._is_valid_wkt(df["geometry"].iloc[0]) - ): - import geopandas - - # `df.to_csv()` in `persist()` has encoded geometries as WKT, so we decode that here. - df.geometry = geopandas.GeoSeries.from_wkt(df["geometry"]) - df = geopandas.GeoDataFrame(df) - return df - - def persist(self, df: pd.DataFrame): - self._merge_into_df(df) - self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_csv(self.path, index=False) - - -class ParquetJobDatabase(FullDataFrameJobDatabase): - """ - Persist/load job metadata with a Parquet file. - - :implements: :py:class:`JobDatabaseInterface` - :param path: Path to the Parquet file. - - .. note:: - Support for Parquet files depends on the ``pyarrow`` package - as :ref:`optional dependency `. - - Support for GeoPandas dataframes depends on the ``geopandas`` package - as :ref:`optional dependency `. - - .. versionadded:: 0.31.0 - """ - - def __init__(self, path: Union[str, Path]): - super().__init__() - self.path = Path(path) - - def __repr__(self): - return f"{self.__class__.__name__}({str(self.path)!r})" - - def exists(self) -> bool: - return self.path.exists() - - def read(self) -> pd.DataFrame: - # Unfortunately, a naive `pandas.read_parquet()` does not easily allow - # reconstructing geometries from a GeoPandas Parquet file. - # And vice-versa, `geopandas.read_parquet()` does not support reading - # Parquet file without geometries. - # So we have to guess which case we have. - # TODO is there a cleaner way to do this? 
- import pyarrow.parquet - - metadata = pyarrow.parquet.read_metadata(self.path) - if b"geo" in metadata.metadata: - import geopandas - - return geopandas.read_parquet(self.path) - else: - return pd.read_parquet(self.path) - - def persist(self, df: pd.DataFrame): - self._merge_into_df(df) - self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_parquet(self.path, index=False) - def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ diff --git a/openeo/extra/job_management/dataframe_job_db.py b/openeo/extra/job_management/dataframe_job_db.py new file mode 100644 index 000000000..39709cf50 --- /dev/null +++ b/openeo/extra/job_management/dataframe_job_db.py @@ -0,0 +1,267 @@ +import abc +import logging +from pathlib import Path +from typing import ( + Iterable, + List, + Union, +) + +import pandas as pd +import shapely.errors +import shapely.wkt + +_log = logging.getLogger(__name__) +from openeo.extra.job_management import MultiBackendJobManager + +class JobDatabaseInterface(metaclass=abc.ABCMeta): + """ + Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, + allowing to regularly persist the job metadata while polling the job statuses + and resume/restart the job tracking after it was interrupted. + + .. versionadded:: 0.31.0 + """ + + @abc.abstractmethod + def exists(self) -> bool: + """Does the job database already exist, to read job data from?""" + ... + + @abc.abstractmethod + def persist(self, df: pd.DataFrame): + """ + Store (now or updated) job data to the database. + + The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, + so it should be merged with the existing data (if any) instead of overwriting it completely. + + :param df: job data to store. + """ + ... + + @abc.abstractmethod + def count_by_status(self, statuses: Iterable[str] = ()) -> dict: + """ + Retrieve the number of jobs per status. + + :param statuses: List/set of statuses to include. If empty, all statuses are included. + + :return: dictionary with status as key and the count as value. + """ + ... + + @abc.abstractmethod + def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: + """ + Returns a dataframe with jobs, filtered by status. + + :param statuses: List of statuses to include. + :param max: Maximum number of jobs to return. + + :return: DataFrame with jobs filtered by status. + """ + ... + + @abc.abstractmethod + def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: + """ + Returns a dataframe with jobs based on their (dataframe) index + + :param indices: List of indices to include. + + :return: DataFrame with jobs filtered by indices. + """ + ... + +class FullDataFrameJobDatabase(JobDatabaseInterface): + def __init__(self): + super().__init__() + self._df = None + + def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): + """ + Initialize the job database from a given dataframe, + which will be first normalized to be compatible + with :py:class:`MultiBackendJobManager` usage. + + :param df: dataframe with some columns your ``start_job`` callable expects + :param on_exists: what to do when the job database already exists (persisted on disk): + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + + :return: initialized job database. + + .. versionadded:: 0.33.0 + """ + # TODO: option to provide custom MultiBackendJobManager subclass with custom normalize? 
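+        # For example (illustrative sketch; the CSV path is an assumption, not part of this module):
+        #     CsvJobDatabase("jobs.csv").initialize_from_df(df, on_exists="skip")
+        # either reuses an existing "jobs.csv" untouched, or normalizes `df` and persists it as the new database.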
+ if self.exists(): + if on_exists == "skip": + return self + elif on_exists == "error": + raise FileExistsError(f"Job database {self!r} already exists.") + else: + # TODO handle other on_exists modes: e.g. overwrite, merge, ... + raise ValueError(f"Invalid on_exists={on_exists!r}") + df = MultiBackendJobManager._normalize_df(df) + self.persist(df) + # Return self to allow chaining with constructor. + return self + + @abc.abstractmethod + def read(self) -> pd.DataFrame: + """ + Read job data from the database as pandas DataFrame. + + :return: loaded job data. + """ + ... + + @property + def df(self) -> pd.DataFrame: + if self._df is None: + self._df = self.read() + return self._df + + def count_by_status(self, statuses: Iterable[str] = ()) -> dict: + status_histogram = self.df.groupby("status").size().to_dict() + statuses = set(statuses) + if statuses: + status_histogram = {k: v for k, v in status_histogram.items() if k in statuses} + return status_histogram + + def get_by_status(self, statuses, max=None) -> pd.DataFrame: + """ + Returns a dataframe with jobs, filtered by status. + + :param statuses: List of statuses to include. + :param max: Maximum number of jobs to return. + + :return: DataFrame with jobs filtered by status. + """ + df = self.df + filtered = df[df.status.isin(statuses)] + return filtered.head(max) if max is not None else filtered + + def _merge_into_df(self, df: pd.DataFrame): + if self._df is not None: + unknown_indices = set(df.index).difference(df.index) + if unknown_indices: + _log.warning(f"Merging DataFrame with {unknown_indices=} which will be lost.") + self._df.update(df, overwrite=True) + else: + self._df = df + + def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: + indices = set(indices) + known = indices.intersection(self.df.index) + unknown = indices.difference(self.df.index) + if unknown: + _log.warning(f"Ignoring unknown DataFrame indices {unknown}") + return self._df.loc[list(known)] + + +class CsvJobDatabase(FullDataFrameJobDatabase): + """ + Persist/load job metadata with a CSV file. + + :implements: :py:class:`JobDatabaseInterface` + :param path: Path to local CSV file. + + .. note:: + Support for GeoPandas dataframes depends on the ``geopandas`` package + as :ref:`optional dependency `. + + .. versionadded:: 0.31.0 + """ + + def __init__(self, path: Union[str, Path]): + super().__init__() + self.path = Path(path) + + def __repr__(self): + return f"{self.__class__.__name__}({str(self.path)!r})" + + def exists(self) -> bool: + return self.path.exists() + + def _is_valid_wkt(self, wkt: str) -> bool: + try: + shapely.wkt.loads(wkt) + return True + except shapely.errors.WKTReadingError: + return False + + def read(self) -> pd.DataFrame: + df = pd.read_csv( + self.path, + # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? + dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, + ) + if ( + "geometry" in df.columns + and df["geometry"].dtype.name != "geometry" + and self._is_valid_wkt(df["geometry"].iloc[0]) + ): + import geopandas + + # `df.to_csv()` in `persist()` has encoded geometries as WKT, so we decode that here. 
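+            # For example, a "geometry" cell persisted as the WKT string "POINT (5.1 51.2)" (an illustrative value)
+            # becomes a shapely geometry again, provided the first row holds valid WKT as checked by `_is_valid_wkt` above.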
+ df.geometry = geopandas.GeoSeries.from_wkt(df["geometry"]) + df = geopandas.GeoDataFrame(df) + return df + + def persist(self, df: pd.DataFrame): + self._merge_into_df(df) + self.path.parent.mkdir(parents=True, exist_ok=True) + self.df.to_csv(self.path, index=False) + +class ParquetJobDatabase(FullDataFrameJobDatabase): + """ + Persist/load job metadata with a Parquet file. + + :implements: :py:class:`JobDatabaseInterface` + :param path: Path to the Parquet file. + + .. note:: + Support for Parquet files depends on the ``pyarrow`` package + as :ref:`optional dependency `. + + Support for GeoPandas dataframes depends on the ``geopandas`` package + as :ref:`optional dependency `. + + .. versionadded:: 0.31.0 + """ + + def __init__(self, path: Union[str, Path]): + super().__init__() + self.path = Path(path) + + def __repr__(self): + return f"{self.__class__.__name__}({str(self.path)!r})" + + def exists(self) -> bool: + return self.path.exists() + + def read(self) -> pd.DataFrame: + # Unfortunately, a naive `pandas.read_parquet()` does not easily allow + # reconstructing geometries from a GeoPandas Parquet file. + # And vice-versa, `geopandas.read_parquet()` does not support reading + # Parquet file without geometries. + # So we have to guess which case we have. + # TODO is there a cleaner way to do this? + import pyarrow.parquet + + metadata = pyarrow.parquet.read_metadata(self.path) + if b"geo" in metadata.metadata: + import geopandas + + return geopandas.read_parquet(self.path) + else: + return pd.read_parquet(self.path) + + def persist(self, df: pd.DataFrame): + self._merge_into_df(df) + self.path.parent.mkdir(parents=True, exist_ok=True) + self.df.to_parquet(self.path, index=False) + + diff --git a/openeo/extra/job_management/process_based_job_creator.py b/openeo/extra/job_management/process_based_job_creator.py new file mode 100644 index 000000000..a75531b6b --- /dev/null +++ b/openeo/extra/job_management/process_based_job_creator.py @@ -0,0 +1,198 @@ + +import logging +import re +from typing import ( + List, + Optional, + Union, +) +import numpy +import pandas as pd +import shapely.geometry.base + +from openeo import BatchJob, Connection +from openeo.internal.processes.parse import ( + Parameter, + Process, + parse_remote_process_definition, +) + +from openeo.util import LazyLoadCache, repr_truncate +_log = logging.getLogger(__name__) + + +class ProcessBasedJobCreator: + """ + Batch job creator + (to be used together with :py:class:`MultiBackendJobManager`) + that takes a parameterized openEO process definition + (e.g a user-defined process (UDP) or a remote openEO process definition), + and creates a batch job + for each row of the dataframe managed by the :py:class:`MultiBackendJobManager` + by filling in the process parameters with corresponding row values. + + .. seealso:: + See :ref:`job-management-with-process-based-job-creator` + for more information and examples. + + Process parameters are linked to dataframe columns by name. + While this intuitive name-based matching should cover most use cases, + there are additional options for overrides or fallbacks: + + - When provided, ``parameter_column_map`` will be consulted + for resolving a process parameter name (key in the dictionary) + to a desired dataframe column name (corresponding value). + - One common case is handled automatically as convenience functionality. 
+ + When: + + - ``parameter_column_map`` is not provided (or set to ``None``), + - and there is a *single parameter* that accepts inline GeoJSON geometries, + - and the dataframe is a GeoPandas dataframe with a *single geometry* column, + + then this parameter and this geometries column will be linked automatically. + + - If a parameter can not be matched with a column by name as described above, + a default value will be picked, + first by looking in ``parameter_defaults`` (if provided), + and then by looking up the default value from the parameter schema in the process definition. + - Finally if no (default) value can be determined and the parameter + is not flagged as optional, an error will be raised. + + + :param process_id: (optional) openEO process identifier. + Can be omitted when working with a remote process definition + that is fully defined with a URL in the ``namespace`` parameter. + :param namespace: (optional) openEO process namespace. + Typically used to provide a URL to a remote process definition. + :param parameter_defaults: (optional) default values for process parameters, + to be used when not available in the dataframe managed by + :py:class:`MultiBackendJobManager`. + :param parameter_column_map: Optional overrides + for linking process parameters to dataframe columns: + mapping of process parameter names as key + to dataframe column names as value. + + .. versionadded:: 0.33.0 + + .. warning:: + This is an experimental API subject to change, + and we greatly welcome + `feedback and suggestions for improvement `_. + + """ + + def __init__( + self, + *, + process_id: Optional[str] = None, + namespace: Union[str, None] = None, + parameter_defaults: Optional[dict] = None, + parameter_column_map: Optional[dict] = None, + ): + if process_id is None and namespace is None: + raise ValueError("At least one of `process_id` and `namespace` should be provided.") + self._process_id = process_id + self._namespace = namespace + self._parameter_defaults = parameter_defaults or {} + self._parameter_column_map = parameter_column_map + self._cache = LazyLoadCache() + + def _get_process_definition(self, connection: Connection) -> Process: + if isinstance(self._namespace, str) and re.match("https?://", self._namespace): + # Remote process definition handling + return self._cache.get( + key=("remote_process_definition", self._namespace, self._process_id), + load=lambda: parse_remote_process_definition(namespace=self._namespace, process_id=self._process_id), + ) + elif self._namespace is None: + # Handling of a user-specific UDP + udp_raw = connection.user_defined_process(self._process_id).describe() + return Process.from_dict(udp_raw) + else: + raise NotImplementedError( + f"Unsupported process definition source udp_id={self._process_id!r} namespace={self._namespace!r}" + ) + + def start_job(self, row: pd.Series, connection: Connection, **_) -> BatchJob: + """ + Implementation of the ``start_job`` callable interface + of :py:meth:`MultiBackendJobManager.run_jobs` + to create a job based on given dataframe row + + :param row: The row in the pandas dataframe that stores the jobs state and other tracked data. + :param connection: The connection to the backend. 
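+        :return: The created (but not yet started) batch job for this row.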
+ """ + # TODO: refactor out some methods, for better reuse and decoupling: + # `get_arguments()` (to build the arguments dictionary), `get_cube()` (to create the cube), + + process_definition = self._get_process_definition(connection=connection) + process_id = process_definition.id + parameters = process_definition.parameters or [] + + if self._parameter_column_map is None: + self._parameter_column_map = self._guess_parameter_column_map(parameters=parameters, row=row) + + arguments = {} + for parameter in parameters: + param_name = parameter.name + column_name = self._parameter_column_map.get(param_name, param_name) + if column_name in row.index: + # Get value from dataframe row + value = row.loc[column_name] + elif param_name in self._parameter_defaults: + # Fallback on default values from constructor + value = self._parameter_defaults[param_name] + elif parameter.has_default(): + # Explicitly use default value from parameter schema + value = parameter.default + elif parameter.optional: + # Skip optional parameters without any fallback default value + continue + else: + raise ValueError(f"Missing required parameter {param_name!r} for process {process_id!r}") + + # Prepare some values/dtypes for JSON encoding + if isinstance(value, numpy.integer): + value = int(value) + elif isinstance(value, numpy.number): + value = float(value) + elif isinstance(value, shapely.geometry.base.BaseGeometry): + value = shapely.geometry.mapping(value) + + arguments[param_name] = value + + cube = connection.datacube_from_process(process_id=process_id, namespace=self._namespace, **arguments) + + title = row.get("title", f"Process {process_id!r} with {repr_truncate(arguments)}") + description = row.get("description", f"Process {process_id!r} (namespace {self._namespace}) with {arguments}") + job = connection.create_job(cube, title=title, description=description) + + return job + + def __call__(self, *arg, **kwargs) -> BatchJob: + """Syntactic sugar for calling :py:meth:`start_job`.""" + return self.start_job(*arg, **kwargs) + + @staticmethod + def _guess_parameter_column_map(parameters: List[Parameter], row: pd.Series) -> dict: + """ + Guess parameter-column mapping from given parameter list and dataframe row + """ + parameter_column_map = {} + # Geometry based mapping: try to automatically map geometry columns to geojson parameters + geojson_parameters = [p.name for p in parameters if p.schema.accepts_geojson()] + geometry_columns = [i for (i, v) in row.items() if isinstance(v, shapely.geometry.base.BaseGeometry)] + if geojson_parameters and geometry_columns: + if len(geojson_parameters) == 1 and len(geometry_columns) == 1: + # Most common case: one geometry parameter and one geometry column: can be mapped naively + parameter_column_map[geojson_parameters[0]] = geometry_columns[0] + elif all(p in geometry_columns for p in geojson_parameters): + # Each geometry param has geometry column with same name: easy to map + parameter_column_map.update((p, p) for p in geojson_parameters) + else: + raise RuntimeError( + f"Problem with mapping geometry columns ({geometry_columns}) to process parameters ({geojson_parameters})" + ) + _log.debug(f"Guessed parameter-column map: {parameter_column_map}") + return parameter_column_map From 2e5f7d32200687662218685b7e2b16c5c63383a6 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Fri, 3 Oct 2025 14:37:25 +0200 Subject: [PATCH 02/10] remove circular dependency --- openeo/extra/job_management/__init__.py | 41 ++++------ .../extra/job_management/_dataframe_utils.py | 33 ++++++++ 
.../{dataframe_job_db.py => _job_database.py} | 77 +++++++++++++++++-- .../job_management/test_job_management.py | 2 +- 4 files changed, 120 insertions(+), 33 deletions(-) create mode 100644 openeo/extra/job_management/_dataframe_utils.py rename openeo/extra/job_management/{dataframe_job_db.py => _job_database.py} (76%) diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 61ada5db8..0a50dd955 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -13,7 +13,6 @@ Callable, Dict, List, - Mapping, NamedTuple, Optional, Tuple, @@ -31,7 +30,9 @@ _JobStartTask, ) from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator -from openeo.extra.job_management.dataframe_job_db import JobDatabaseInterface, FullDataFrameJobDatabase, ParquetJobDatabase, CsvJobDatabase +from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db +from openeo.extra.job_management._dataframe_utils import normalize_dataframe + from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth @@ -44,6 +45,10 @@ "FullDataFrameJobDatabase", "ParquetJobDatabase", "CsvJobDatabase", + "ProcessBasedJobCreator", + "create_job_db", + "get_job_db", + ] class _Backend(NamedTuple): @@ -73,6 +78,8 @@ class _ColumnProperties: default: Any = None + + class MultiBackendJobManager: """ Tracker for multiple jobs on multiple backends. @@ -143,21 +150,7 @@ def start_job( # Expected columns in the job DB dataframes. # TODO: make this part of public API when settled? # TODO: move non official statuses to seperate column (not_started, queued_for_start) - _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - # TODO: use proper date/time dtype instead of legacy str for start times? - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, - # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. - # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), - } + def __init__( self, @@ -260,17 +253,8 @@ def _make_resilient(connection): @classmethod def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. - - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. 
- """ - new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - return df + return normalize_dataframe(df) def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): """ @@ -863,6 +847,7 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): +<<<<<<< HEAD def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ Factory to get a job database at a given path, @@ -902,3 +887,5 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") return job_db +======= +>>>>>>> 4825bbcd (remove circular dependency) diff --git a/openeo/extra/job_management/_dataframe_utils.py b/openeo/extra/job_management/_dataframe_utils.py new file mode 100644 index 000000000..151390171 --- /dev/null +++ b/openeo/extra/job_management/_dataframe_utils.py @@ -0,0 +1,33 @@ +import pandas as pd + +class _ColumnProperties: + def __init__(self, dtype: str, default=None): + self.dtype = dtype + self.default = default + +# Expected columns in the job DB dataframes. +# TODO: make this part of public API when settled? +# TODO: move non official statuses to seperate column (not_started, queued_for_start) +COLUMN_REQUIREMENTS = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), +} + +def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. + """ + new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + return df \ No newline at end of file diff --git a/openeo/extra/job_management/dataframe_job_db.py b/openeo/extra/job_management/_job_database.py similarity index 76% rename from openeo/extra/job_management/dataframe_job_db.py rename to openeo/extra/job_management/_job_database.py index 39709cf50..55c00a10f 100644 --- a/openeo/extra/job_management/dataframe_job_db.py +++ b/openeo/extra/job_management/_job_database.py @@ -3,16 +3,20 @@ from pathlib import Path from typing import ( Iterable, - List, Union, + List, ) + import pandas as pd import shapely.errors import shapely.wkt +from openeo.extra.job_management._dataframe_utils import normalize_dataframe, COLUMN_REQUIREMENTS + + + _log = logging.getLogger(__name__) -from openeo.extra.job_management import MultiBackendJobManager class JobDatabaseInterface(metaclass=abc.ABCMeta): """ @@ -74,6 +78,8 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: """ ... + + class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): super().__init__() @@ -103,7 +109,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): else: # TODO handle other on_exists modes: e.g. overwrite, merge, ... 
raise ValueError(f"Invalid on_exists={on_exists!r}") - df = MultiBackendJobManager._normalize_df(df) + df = normalize_dataframe(df) self.persist(df) # Return self to allow chaining with constructor. return self @@ -161,6 +167,7 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: return self._df.loc[list(known)] + class CsvJobDatabase(FullDataFrameJobDatabase): """ Persist/load job metadata with a CSV file. @@ -196,7 +203,7 @@ def read(self) -> pd.DataFrame: df = pd.read_csv( self.path, # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, + dtype={c: r.dtype for (c, r) in COLUMN_REQUIREMENTS.items()}, ) if ( "geometry" in df.columns @@ -262,6 +269,66 @@ def read(self) -> pd.DataFrame: def persist(self, df: pd.DataFrame): self._merge_into_df(df) self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_parquet(self.path, index=False) + self.df.to_parquet(self.path, index=False) + +def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): + """ + Factory to create a job database at given path, + initialized from a given dataframe, + and its database type guessed from filename extension. + + :param path: Path to the job database file. + :param df: DataFrame to store in the job database. + :param on_exists: What to do when the job database already exists: + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + .. versionadded:: 0.33.0 + """ + job_db = get_job_db(path) + if isinstance(job_db, FullDataFrameJobDatabase): + job_db.initialize_from_df(df=df, on_exists=on_exists) + else: + raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") + return job_db + +def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: + """ + Factory to get a job database at a given path, + guessing the database type from filename extension. + + :param path: path to job database file. + + .. versionadded:: 0.33.0 + """ + path = Path(path) + if path.suffix.lower() in {".csv"}: + job_db = CsvJobDatabase(path=path) + elif path.suffix.lower() in {".parquet", ".geoparquet"}: + job_db = ParquetJobDatabase(path=path) + else: + raise ValueError(f"Could not guess job database type from {path!r}") + return job_db + + +def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): + """ + Factory to create a job database at given path, + initialized from a given dataframe, + and its database type guessed from filename extension. + + :param path: Path to the job database file. + :param df: DataFrame to store in the job database. + :param on_exists: What to do when the job database already exists: + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + + .. 
versionadded:: 0.33.0 + """ + job_db = get_job_db(path) + if isinstance(job_db, FullDataFrameJobDatabase): + job_db.initialize_from_df(df=df, on_exists=on_exists) + else: + raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") + return job_db diff --git a/tests/extra/job_management/test_job_management.py b/tests/extra/job_management/test_job_management.py index fab901548..58901c05a 100644 --- a/tests/extra/job_management/test_job_management.py +++ b/tests/extra/job_management/test_job_management.py @@ -869,7 +869,7 @@ def test_refresh_bearer_token_before_start( dummy_backend_bar, job_manager_root_dir, sleep_mock, - requests_mock, + requests_mock ): client_id = "client123" From 0a6d03c6a600de345c3ae2e85a096f8bfc1e3c60 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Fri, 3 Oct 2025 14:39:38 +0200 Subject: [PATCH 03/10] centralizing --- openeo/extra/job_management/__init__.py | 4 +- .../extra/job_management/_dataframe_utils.py | 33 ---------------- openeo/extra/job_management/_job_database.py | 38 +++++++++++++++++-- 3 files changed, 36 insertions(+), 39 deletions(-) delete mode 100644 openeo/extra/job_management/_dataframe_utils.py diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 0a50dd955..1660ae0b5 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -30,9 +30,7 @@ _JobStartTask, ) from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator -from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db -from openeo.extra.job_management._dataframe_utils import normalize_dataframe - +from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db, normalize_dataframe from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth diff --git a/openeo/extra/job_management/_dataframe_utils.py b/openeo/extra/job_management/_dataframe_utils.py deleted file mode 100644 index 151390171..000000000 --- a/openeo/extra/job_management/_dataframe_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd - -class _ColumnProperties: - def __init__(self, dtype: str, default=None): - self.dtype = dtype - self.default = default - -# Expected columns in the job DB dataframes. -# TODO: make this part of public API when settled? -# TODO: move non official statuses to seperate column (not_started, queued_for_start) -COLUMN_REQUIREMENTS = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), -} - -def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. - - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. 
- """ - new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - return df \ No newline at end of file diff --git a/openeo/extra/job_management/_job_database.py b/openeo/extra/job_management/_job_database.py index 55c00a10f..4be3fc1eb 100644 --- a/openeo/extra/job_management/_job_database.py +++ b/openeo/extra/job_management/_job_database.py @@ -12,12 +12,19 @@ import shapely.errors import shapely.wkt -from openeo.extra.job_management._dataframe_utils import normalize_dataframe, COLUMN_REQUIREMENTS - _log = logging.getLogger(__name__) +import pandas as pd + +class _ColumnProperties: + def __init__(self, dtype: str, default=None): + self.dtype = dtype + self.default = default + + + class JobDatabaseInterface(metaclass=abc.ABCMeta): """ Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, @@ -78,7 +85,20 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: """ ... - +# Expected columns in the job DB dataframes. +# TODO: make this part of public API when settled? +# TODO: move non official statuses to seperate column (not_started, queued_for_start) +COLUMN_REQUIREMENTS = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), +} class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): @@ -271,6 +291,18 @@ def persist(self, df: pd.DataFrame): self.path.parent.mkdir(parents=True, exist_ok=True) self.df.to_parquet(self.path, index=False) +def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. 
+ """ + new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + return df + def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): """ Factory to create a job database at given path, From 57458a39b05c91d4b54508b1958efd5de3f8466b Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Tue, 4 Nov 2025 13:23:22 +0100 Subject: [PATCH 04/10] rebase --- openeo/extra/job_management/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 1660ae0b5..1f70cb686 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -845,7 +845,6 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): -<<<<<<< HEAD def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ Factory to get a job database at a given path, @@ -885,5 +884,3 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") return job_db -======= ->>>>>>> 4825bbcd (remove circular dependency) From c2f1ad1b587049ef109403cca37cc7e5521b1327 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Tue, 4 Nov 2025 13:27:31 +0100 Subject: [PATCH 05/10] duplication --- .../process_based_job_creator.py | 198 ------------------ 1 file changed, 198 deletions(-) delete mode 100644 openeo/extra/job_management/process_based_job_creator.py diff --git a/openeo/extra/job_management/process_based_job_creator.py b/openeo/extra/job_management/process_based_job_creator.py deleted file mode 100644 index a75531b6b..000000000 --- a/openeo/extra/job_management/process_based_job_creator.py +++ /dev/null @@ -1,198 +0,0 @@ - -import logging -import re -from typing import ( - List, - Optional, - Union, -) -import numpy -import pandas as pd -import shapely.geometry.base - -from openeo import BatchJob, Connection -from openeo.internal.processes.parse import ( - Parameter, - Process, - parse_remote_process_definition, -) - -from openeo.util import LazyLoadCache, repr_truncate -_log = logging.getLogger(__name__) - - -class ProcessBasedJobCreator: - """ - Batch job creator - (to be used together with :py:class:`MultiBackendJobManager`) - that takes a parameterized openEO process definition - (e.g a user-defined process (UDP) or a remote openEO process definition), - and creates a batch job - for each row of the dataframe managed by the :py:class:`MultiBackendJobManager` - by filling in the process parameters with corresponding row values. - - .. seealso:: - See :ref:`job-management-with-process-based-job-creator` - for more information and examples. - - Process parameters are linked to dataframe columns by name. - While this intuitive name-based matching should cover most use cases, - there are additional options for overrides or fallbacks: - - - When provided, ``parameter_column_map`` will be consulted - for resolving a process parameter name (key in the dictionary) - to a desired dataframe column name (corresponding value). - - One common case is handled automatically as convenience functionality. 
- - When: - - - ``parameter_column_map`` is not provided (or set to ``None``), - - and there is a *single parameter* that accepts inline GeoJSON geometries, - - and the dataframe is a GeoPandas dataframe with a *single geometry* column, - - then this parameter and this geometries column will be linked automatically. - - - If a parameter can not be matched with a column by name as described above, - a default value will be picked, - first by looking in ``parameter_defaults`` (if provided), - and then by looking up the default value from the parameter schema in the process definition. - - Finally if no (default) value can be determined and the parameter - is not flagged as optional, an error will be raised. - - - :param process_id: (optional) openEO process identifier. - Can be omitted when working with a remote process definition - that is fully defined with a URL in the ``namespace`` parameter. - :param namespace: (optional) openEO process namespace. - Typically used to provide a URL to a remote process definition. - :param parameter_defaults: (optional) default values for process parameters, - to be used when not available in the dataframe managed by - :py:class:`MultiBackendJobManager`. - :param parameter_column_map: Optional overrides - for linking process parameters to dataframe columns: - mapping of process parameter names as key - to dataframe column names as value. - - .. versionadded:: 0.33.0 - - .. warning:: - This is an experimental API subject to change, - and we greatly welcome - `feedback and suggestions for improvement `_. - - """ - - def __init__( - self, - *, - process_id: Optional[str] = None, - namespace: Union[str, None] = None, - parameter_defaults: Optional[dict] = None, - parameter_column_map: Optional[dict] = None, - ): - if process_id is None and namespace is None: - raise ValueError("At least one of `process_id` and `namespace` should be provided.") - self._process_id = process_id - self._namespace = namespace - self._parameter_defaults = parameter_defaults or {} - self._parameter_column_map = parameter_column_map - self._cache = LazyLoadCache() - - def _get_process_definition(self, connection: Connection) -> Process: - if isinstance(self._namespace, str) and re.match("https?://", self._namespace): - # Remote process definition handling - return self._cache.get( - key=("remote_process_definition", self._namespace, self._process_id), - load=lambda: parse_remote_process_definition(namespace=self._namespace, process_id=self._process_id), - ) - elif self._namespace is None: - # Handling of a user-specific UDP - udp_raw = connection.user_defined_process(self._process_id).describe() - return Process.from_dict(udp_raw) - else: - raise NotImplementedError( - f"Unsupported process definition source udp_id={self._process_id!r} namespace={self._namespace!r}" - ) - - def start_job(self, row: pd.Series, connection: Connection, **_) -> BatchJob: - """ - Implementation of the ``start_job`` callable interface - of :py:meth:`MultiBackendJobManager.run_jobs` - to create a job based on given dataframe row - - :param row: The row in the pandas dataframe that stores the jobs state and other tracked data. - :param connection: The connection to the backend. 
- """ - # TODO: refactor out some methods, for better reuse and decoupling: - # `get_arguments()` (to build the arguments dictionary), `get_cube()` (to create the cube), - - process_definition = self._get_process_definition(connection=connection) - process_id = process_definition.id - parameters = process_definition.parameters or [] - - if self._parameter_column_map is None: - self._parameter_column_map = self._guess_parameter_column_map(parameters=parameters, row=row) - - arguments = {} - for parameter in parameters: - param_name = parameter.name - column_name = self._parameter_column_map.get(param_name, param_name) - if column_name in row.index: - # Get value from dataframe row - value = row.loc[column_name] - elif param_name in self._parameter_defaults: - # Fallback on default values from constructor - value = self._parameter_defaults[param_name] - elif parameter.has_default(): - # Explicitly use default value from parameter schema - value = parameter.default - elif parameter.optional: - # Skip optional parameters without any fallback default value - continue - else: - raise ValueError(f"Missing required parameter {param_name!r} for process {process_id!r}") - - # Prepare some values/dtypes for JSON encoding - if isinstance(value, numpy.integer): - value = int(value) - elif isinstance(value, numpy.number): - value = float(value) - elif isinstance(value, shapely.geometry.base.BaseGeometry): - value = shapely.geometry.mapping(value) - - arguments[param_name] = value - - cube = connection.datacube_from_process(process_id=process_id, namespace=self._namespace, **arguments) - - title = row.get("title", f"Process {process_id!r} with {repr_truncate(arguments)}") - description = row.get("description", f"Process {process_id!r} (namespace {self._namespace}) with {arguments}") - job = connection.create_job(cube, title=title, description=description) - - return job - - def __call__(self, *arg, **kwargs) -> BatchJob: - """Syntactic sugar for calling :py:meth:`start_job`.""" - return self.start_job(*arg, **kwargs) - - @staticmethod - def _guess_parameter_column_map(parameters: List[Parameter], row: pd.Series) -> dict: - """ - Guess parameter-column mapping from given parameter list and dataframe row - """ - parameter_column_map = {} - # Geometry based mapping: try to automatically map geometry columns to geojson parameters - geojson_parameters = [p.name for p in parameters if p.schema.accepts_geojson()] - geometry_columns = [i for (i, v) in row.items() if isinstance(v, shapely.geometry.base.BaseGeometry)] - if geojson_parameters and geometry_columns: - if len(geojson_parameters) == 1 and len(geometry_columns) == 1: - # Most common case: one geometry parameter and one geometry column: can be mapped naively - parameter_column_map[geojson_parameters[0]] = geometry_columns[0] - elif all(p in geometry_columns for p in geojson_parameters): - # Each geometry param has geometry column with same name: easy to map - parameter_column_map.update((p, p) for p in geojson_parameters) - else: - raise RuntimeError( - f"Problem with mapping geometry columns ({geometry_columns}) to process parameters ({geojson_parameters})" - ) - _log.debug(f"Guessed parameter-column map: {parameter_column_map}") - return parameter_column_map From 5f3211bb8103dfcc788979dc6750b8fd739aceaf Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Wed, 5 Nov 2025 14:26:00 +0100 Subject: [PATCH 06/10] splitting --- openeo/extra/job_management/__init__.py | 879 +----------------- openeo/extra/job_management/_interface.py | 75 ++ 
.../{_job_database.py => _job_db.py} | 142 +-- openeo/extra/job_management/_manager.py | 853 +++++++++++++++++ 4 files changed, 943 insertions(+), 1006 deletions(-) create mode 100644 openeo/extra/job_management/_interface.py rename openeo/extra/job_management/{_job_database.py => _job_db.py} (65%) create mode 100644 openeo/extra/job_management/_manager.py diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 1f70cb686..dd42ac1a8 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -1,42 +1,7 @@ -import collections -import contextlib -import dataclasses -import datetime -import json -import logging -import time -import warnings -from pathlib import Path -from threading import Thread -from typing import ( - Any, - Callable, - Dict, - List, - NamedTuple, - Optional, - Tuple, - Union, -) - -import pandas as pd -import requests -from requests.adapters import HTTPAdapter -from urllib3.util import Retry - -from openeo import BatchJob, Connection -from openeo.extra.job_management._thread_worker import ( - _JobManagerWorkerThreadPool, - _JobStartTask, -) -from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator -from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db, normalize_dataframe - -from openeo.rest import OpenEoApiError -from openeo.rest.auth.auth import BearerAuth -from openeo.util import deep_get, rfc3339 - -_log = logging.getLogger(__name__) +from openeo.extra.job_management._interface import JobDatabaseInterface +from openeo.extra.job_management._job_db import FullDataFrameJobDatabase, CsvJobDatabase, ParquetJobDatabase, create_job_db, get_job_db +from openeo.extra.job_management.process_based import ProcessBasedJobCreator +from openeo.extra.job_management._manager import MultiBackendJobManager __all__ = [ "JobDatabaseInterface", @@ -46,841 +11,7 @@ "ProcessBasedJobCreator", "create_job_db", "get_job_db", - + "MultiBackendJobManager" ] -class _Backend(NamedTuple): - """Container for backend info/settings""" - - # callable to create a backend connection - get_connection: Callable[[], Connection] - # Maximum number of jobs to allow in parallel on a backend - parallel_jobs: int - - -MAX_RETRIES = 50 - -# Sentinel value to indicate that a parameter was not set -_UNSET = object() - - -def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): - raise NotImplementedError("No 'start_job' callable provided") - - -@dataclasses.dataclass(frozen=True) -class _ColumnProperties: - """Expected/required properties of a column in the job manager related dataframes""" - - dtype: str = "object" - default: Any = None - - - - -class MultiBackendJobManager: - """ - Tracker for multiple jobs on multiple backends. - - Usage example: - - .. code-block:: python - - import logging - import pandas as pd - import openeo - from openeo.extra.job_management import MultiBackendJobManager - - logging.basicConfig( - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - level=logging.INFO - ) - - manager = MultiBackendJobManager() - manager.add_backend("foo", connection=openeo.connect("http://foo.test")) - manager.add_backend("bar", connection=openeo.connect("http://bar.test")) - - jobs_df = pd.DataFrame(...) 
- output_file = "jobs.csv" - - def start_job( - row: pd.Series, - connection: openeo.Connection, - **kwargs - ) -> openeo.BatchJob: - year = row["year"] - cube = connection.load_collection( - ..., - temporal_extent=[f"{year}-01-01", f"{year+1}-01-01"], - ) - ... - return cube.create_job(...) - - manager.run_jobs(df=jobs_df, start_job=start_job, output_file=output_file) - - See :py:meth:`.run_jobs` for more information on the ``start_job`` callable. - - :param poll_sleep: - How many seconds to sleep between polls. - - :param root_dir: - Root directory to save files for the jobs, e.g. metadata and error logs. - This defaults to "." the current directory. - - Each job gets its own subfolder in this root directory. - You can use the following methods to find the relevant paths, - based on the job ID: - - - get_job_dir - - get_error_log_path - - get_job_metadata_path - - :param cancel_running_job_after: - Optional temporal limit (in seconds) after which running jobs should be canceled - by the job manager. - - .. versionadded:: 0.14.0 - - .. versionchanged:: 0.32.0 - Added ``cancel_running_job_after`` parameter. - """ - - # Expected columns in the job DB dataframes. - # TODO: make this part of public API when settled? - # TODO: move non official statuses to seperate column (not_started, queued_for_start) - - - def __init__( - self, - poll_sleep: int = 60, - root_dir: Optional[Union[str, Path]] = ".", - *, - cancel_running_job_after: Optional[int] = None, - ): - """Create a MultiBackendJobManager.""" - self._stop_thread = None - self.backends: Dict[str, _Backend] = {} - self.poll_sleep = poll_sleep - self._connections: Dict[str, _Backend] = {} - - # An explicit None or "" should also default to "." - self._root_dir = Path(root_dir or ".") - - self._cancel_running_job_after = ( - datetime.timedelta(seconds=cancel_running_job_after) if cancel_running_job_after is not None else None - ) - self._thread = None - self._worker_pool = None - # Generic cache - self._cache = {} - - def add_backend( - self, - name: str, - connection: Union[Connection, Callable[[], Connection]], - parallel_jobs: int = 2, - ): - """ - Register a backend with a name and a Connection getter. - - :param name: - Name of the backend. - :param connection: - Either a Connection to the backend, or a callable to create a backend connection. - :param parallel_jobs: - Maximum number of jobs to allow in parallel on a backend. - """ - - # TODO: Code might become simpler if we turn _Backend into class move this logic there. - # We would need to keep add_backend here as part of the public API though. - # But the amount of unrelated "stuff to manage" would be less (better cohesion) - if isinstance(connection, Connection): - c = connection - connection = lambda: c - assert callable(connection) - self.backends[name] = _Backend(get_connection=connection, parallel_jobs=parallel_jobs) - - def _get_connection(self, backend_name: str, resilient: bool = True) -> Connection: - """Get a connection for the backend and optionally make it resilient (adds retry behavior) - - The default is to get a resilient connection, but if necessary you can turn it off with - resilient=False - """ - - # TODO: Code could be simplified if _Backend is a class and this method is moved there. - # TODO: Is it better to make this a public method? - - # Reuse the connection if we can, in order to avoid modifying the same connection several times. - # This is to avoid adding the retry HTTPAdapter multiple times. 
- # Remember that the get_connection attribute on _Backend can be a Connection object instead - # of a callable, so we don't want to assume it is a fresh connection that doesn't have the - # retry adapter yet. - if backend_name in self._connections: - return self._connections[backend_name] - - connection = self.backends[backend_name].get_connection() - # If we really need it we can skip making it resilient, but by default it should be resilient. - if resilient: - self._make_resilient(connection) - - self._connections[backend_name] = connection - return connection - - @staticmethod - def _make_resilient(connection): - """Add an HTTPAdapter that retries the request if it fails. - - Retry for the following HTTP 50x statuses: - 502 Bad Gateway - 503 Service Unavailable - 504 Gateway Timeout - """ - # TODO: migrate this to now built-in retry configuration of `Connection` or `openeo.util.http.retry_adapter`? - status_forcelist = [500, 502, 503, 504] - retries = Retry( - total=MAX_RETRIES, - read=MAX_RETRIES, - other=MAX_RETRIES, - status=MAX_RETRIES, - backoff_factor=0.1, - status_forcelist=status_forcelist, - allowed_methods=["HEAD", "GET", "OPTIONS", "POST"], - ) - connection.session.mount("https://", HTTPAdapter(max_retries=retries)) - connection.session.mount("http://", HTTPAdapter(max_retries=retries)) - - @classmethod - def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: - - return normalize_dataframe(df) - - def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): - """ - Start running the jobs in a separate thread, returns afterwards. - - :param start_job: - A callback which will be invoked with, amongst others, - the row of the dataframe for which a job should be created and/or started. - This callable should return a :py:class:`openeo.rest.job.BatchJob` object. - - The following parameters will be passed to ``start_job``: - - ``row`` (:py:class:`pandas.Series`): - The row in the pandas dataframe that stores the jobs state and other tracked data. - - ``connection_provider``: - A getter to get a connection by backend name. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``connection`` (:py:class:`Connection`): - The :py:class:`Connection` itself, that has already been created. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``provider`` (``str``): - The name of the backend that will run the job. - - You do not have to define all the parameters described below, but if you leave - any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. - Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. - :param job_db: - Job database to load/store existing job status data and other metadata from/to. - Can be specified as a path to CSV or Parquet file, - or as a custom database object following the :py:class:`JobDatabaseInterface` interface. - - .. note:: - Support for Parquet files depends on the ``pyarrow`` package - as :ref:`optional dependency `. - - .. 
versionadded:: 0.32.0 - """ - - # Resume from existing db - _log.info(f"Resuming `run_jobs` from existing {job_db}") - - self._stop_thread = False - self._worker_pool = _JobManagerWorkerThreadPool() - - def run_loop(): - # TODO: support user-provided `stats` - stats = collections.defaultdict(int) - - while ( - sum( - job_db.count_by_status( - statuses=["not_started", "created", "queued", "queued_for_start", "running"] - ).values() - ) - > 0 - and not self._stop_thread - ): - self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) - stats["run_jobs loop"] += 1 - - # Show current stats and sleep - _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") - for _ in range(int(max(1, self.poll_sleep))): - time.sleep(1) - if self._stop_thread: - break - - self._thread = Thread(target=run_loop) - self._thread.start() - - def stop_job_thread(self, timeout_seconds: Optional[float] = _UNSET): - """ - Stop the job polling thread. - - :param timeout_seconds: The time to wait for the thread to stop. - By default, it will wait for 2 times the poll_sleep time. - Set to None to wait indefinitely. - - .. versionadded:: 0.32.0 - """ - self._worker_pool.shutdown() - - if self._thread is not None: - self._stop_thread = True - if timeout_seconds is _UNSET: - timeout_seconds = 2 * self.poll_sleep - self._thread.join(timeout_seconds) - if self._thread.is_alive(): - _log.warning("Job thread did not stop after timeout") - else: - _log.error("No job thread to stop") - - def run_jobs( - self, - df: Optional[pd.DataFrame] = None, - start_job: Callable[[], BatchJob] = _start_job_default, - job_db: Union[str, Path, JobDatabaseInterface, None] = None, - **kwargs, - ) -> dict: - """Runs jobs, specified in a dataframe, and tracks parameters. - - :param df: - DataFrame that specifies the jobs, and tracks the jobs' statuses. If None, the job_db has to be specified and will be used. - - :param start_job: - A callback which will be invoked with, amongst others, - the row of the dataframe for which a job should be created and/or started. - This callable should return a :py:class:`openeo.rest.job.BatchJob` object. - - The following parameters will be passed to ``start_job``: - - ``row`` (:py:class:`pandas.Series`): - The row in the pandas dataframe that stores the jobs state and other tracked data. - - ``connection_provider``: - A getter to get a connection by backend name. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``connection`` (:py:class:`Connection`): - The :py:class:`Connection` itself, that has already been created. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``provider`` (``str``): - The name of the backend that will run the job. - - You do not have to define all the parameters described below, but if you leave - any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. - Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. - - :param job_db: - Job database to load/store existing job status data and other metadata from/to. - Can be specified as a path to CSV or Parquet file, - or as a custom database object following the :py:class:`JobDatabaseInterface` interface. - - .. note:: - Support for Parquet files depends on the ``pyarrow`` package - as :ref:`optional dependency `. 
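A minimal sketch of the ``job_db`` usage described here, replacing the deprecated ``output_file`` from the class example above; the backend URL and collection id are placeholders:

.. code-block:: python

    import openeo
    import pandas as pd
    from openeo.extra.job_management import MultiBackendJobManager

    # One row per job; extra columns (here "year") reach start_job via `row`.
    jobs_df = pd.DataFrame({"year": [2020, 2021, 2022]})

    def start_job(row: pd.Series, connection: openeo.Connection, **kwargs) -> openeo.BatchJob:
        # `**kwargs` absorbs the other arguments (connection_provider, provider, ...).
        cube = connection.load_collection(
            "SENTINEL2_L2A",  # placeholder collection id
            temporal_extent=[f"{row['year']}-01-01", f"{row['year'] + 1}-01-01"],
        )
        return cube.create_job(title=f"job-{row['year']}")

    manager = MultiBackendJobManager(poll_sleep=30)
    manager.add_backend("foo", connection=openeo.connect("https://openeo.foo.test"), parallel_jobs=2)

    # A ".csv" path is resolved to a CsvJobDatabase (use a ".parquet" path for ParquetJobDatabase).
    stats = manager.run_jobs(df=jobs_df, start_job=start_job, job_db="jobs.csv")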
- - :return: dictionary with stats collected during the job running loop. - Note that the set of fields in this dictionary is experimental - and subject to change - - .. versionchanged:: 0.31.0 - Added support for persisting the job metadata in Parquet format. - - .. versionchanged:: 0.31.0 - Replace ``output_file`` argument with ``job_db`` argument, - which can be a path to a CSV or Parquet file, - or a user-defined :py:class:`JobDatabaseInterface` object. - The deprecated ``output_file`` argument is still supported for now. - - .. versionchanged:: 0.33.0 - return a stats dictionary - """ - # TODO Defining start_jobs as a Protocol might make its usage more clear, and avoid complicated docstrings, - - # Backwards compatibility for deprecated `output_file` argument - if "output_file" in kwargs: - if job_db is not None: - raise ValueError("Only one of `output_file` and `job_db` should be provided") - warnings.warn( - "The `output_file` argument is deprecated. Use `job_db` instead.", DeprecationWarning, stacklevel=2 - ) - job_db = kwargs.pop("output_file") - assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" - - if isinstance(job_db, (str, Path)): - job_db = get_job_db(path=job_db) - - if not isinstance(job_db, JobDatabaseInterface): - raise ValueError(f"Unsupported job_db {job_db!r}") - - if job_db.exists(): - # Resume from existing db - _log.info(f"Resuming `run_jobs` from existing {job_db}") - elif df is not None: - # TODO: start showing deprecation warnings for this usage pattern? - job_db.initialize_from_df(df) - - # TODO: support user-provided `stats` - stats = collections.defaultdict(int) - - self._worker_pool = _JobManagerWorkerThreadPool() - - while ( - sum( - job_db.count_by_status( - statuses=["not_started", "created", "queued_for_start", "queued", "running"] - ).values() - ) - > 0 - ): - self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) - stats["run_jobs loop"] += 1 - - # Show current stats and sleep - _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") - time.sleep(self.poll_sleep) - stats["sleep"] += 1 - - # TODO; run post process after shutdown once more to ensure completion? - self._worker_pool.shutdown() - - return stats - - def _job_update_loop( - self, job_db: JobDatabaseInterface, start_job: Callable[[], BatchJob], stats: Optional[dict] = None - ): - """ - Inner loop logic of job management: - go through the necessary jobs to check for status updates, - trigger status events, start new jobs when there is room for them, etc. - """ - if not self.backends: - raise RuntimeError("No backends registered") - - stats = stats if stats is not None else collections.defaultdict(int) - - with ignore_connection_errors(context="get statuses"): - jobs_done, jobs_error, jobs_cancel = self._track_statuses(job_db, stats=stats) - stats["track_statuses"] += 1 - - not_started = job_db.get_by_status(statuses=["not_started"], max=200).copy() - if len(not_started) > 0: - # Check number of jobs running at each backend - # TODO: should "created" be included in here? Calling this "running" is quite misleading then. 
- running = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]) - stats["job_db get_by_status"] += 1 - per_backend = running.groupby("backend_name").size().to_dict() - _log.info(f"Running per backend: {per_backend}") - total_added = 0 - for backend_name in self.backends: - backend_load = per_backend.get(backend_name, 0) - if backend_load < self.backends[backend_name].parallel_jobs: - to_add = self.backends[backend_name].parallel_jobs - backend_load - for i in not_started.index[total_added : total_added + to_add]: - self._launch_job(start_job, df=not_started, i=i, backend_name=backend_name, stats=stats) - stats["job launch"] += 1 - - job_db.persist(not_started.loc[i : i + 1]) - stats["job_db persist"] += 1 - total_added += 1 - - self._process_threadworker_updates(self._worker_pool, job_db=job_db, stats=stats) - - # TODO: move this back closer to the `_track_statuses` call above, once job done/error handling is also handled in threads? - for job, row in jobs_done: - self.on_job_done(job, row) - - for job, row in jobs_error: - self.on_job_error(job, row) - - for job, row in jobs_cancel: - self.on_job_cancel(job, row) - - def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = None): - """Helper method for launching jobs - - :param start_job: - A callback which will be invoked with the row of the dataframe for which a job should be started. - This callable should return a :py:class:`openeo.rest.job.BatchJob` object. - - See also: - `MultiBackendJobManager.run_jobs` for the parameters and return type of this callable - - Even though it is called here in `_launch_job` and that is where the constraints - really come from, the public method `run_jobs` needs to document `start_job` anyway, - so let's avoid duplication in the docstrings. - - :param df: - DataFrame that specifies the jobs, and tracks the jobs' statuses. - - :param i: - index of the job's row in dataframe df - - :param backend_name: - name of the backend that will execute the job. 
- """ - stats = stats if stats is not None else collections.defaultdict(int) - - df.loc[i, "backend_name"] = backend_name - row = df.loc[i] - try: - _log.info(f"Starting job on backend {backend_name} for {row.to_dict()}") - connection = self._get_connection(backend_name, resilient=True) - - stats["start_job call"] += 1 - job = start_job( - row=row, - connection_provider=self._get_connection, - connection=connection, - provider=backend_name, - ) - except requests.exceptions.ConnectionError as e: - _log.warning(f"Failed to start job for {row.to_dict()}", exc_info=True) - df.loc[i, "status"] = "start_failed" - stats["start_job error"] += 1 - else: - df.loc[i, "start_time"] = rfc3339.now_utc() - if job: - df.loc[i, "id"] = job.job_id - _log.info(f"Job created: {job.job_id}") - with ignore_connection_errors(context="get status"): - status = job.status() - stats["job get status"] += 1 - df.loc[i, "status"] = status - if status == "created": - # start job if not yet done by callback - try: - job_con = job.connection - # Proactively refresh bearer token (because task in thread will not be able to do that) - self._refresh_bearer_token(connection=job_con) - task = _JobStartTask( - root_url=job_con.root_url, - bearer_token=job_con.auth.bearer if isinstance(job_con.auth, BearerAuth) else None, - job_id=job.job_id, - df_idx=i, - ) - _log.info(f"Submitting task {task} to thread pool") - self._worker_pool.submit_task(task) - - stats["job_queued_for_start"] += 1 - df.loc[i, "status"] = "queued_for_start" - except OpenEoApiError as e: - _log.info(f"Failed submitting task {task} to thread pool with error: {e}") - df.loc[i, "status"] = "queued_for_start_failed" - stats["job queued for start failed"] += 1 - else: - # TODO: what is this "skipping" about actually? - df.loc[i, "status"] = "skipped" - stats["start_job skipped"] += 1 - - def _refresh_bearer_token(self, connection: Connection, *, max_age: float = 60) -> None: - """ - Helper to proactively refresh the bearer (access) token of the connection - (but not too often, based on `max_age`). - """ - # TODO: be smarter about timing, e.g. by inspecting expiry of current token? - now = time.time() - key = f"connection:{id(connection)}:refresh-time" - if self._cache.get(key, 0) + max_age < now: - refreshed = connection.try_access_token_refresh() - if refreshed: - self._cache[key] = now - else: - _log.warning("Failed to proactively refresh bearer token") - - def _process_threadworker_updates( - self, - worker_pool: _JobManagerWorkerThreadPool, - *, - job_db: JobDatabaseInterface, - stats: Dict[str, int], - ) -> None: - """ - Fetches completed TaskResult objects from the worker pool and applies - their db_update and stats_updates. Only existing DataFrame rows - (matched by df_idx) are upserted via job_db.persist(). Any results - targeting unknown df_idx indices are logged as errors but not persisted. 
- - :param worker_pool: Thread-pool managing asynchronous Task executes - :param job_db: Interface to append/upsert to the job database - :param stats: Dictionary accumulating statistic counters - """ - # Retrieve completed task results immediately - results, _ = worker_pool.process_futures(timeout=0) - - # Collect update dicts - updates: List[Dict[str, Any]] = [] - for res in results: - # Process database updates - if res.db_update: - try: - updates.append( - { - "id": res.job_id, - "df_idx": res.df_idx, - **res.db_update, - } - ) - except Exception as e: - _log.error(f"Skipping invalid db_update {res.db_update!r} for job {res.job_id!r}: {e}") - - # Process stats updates - if res.stats_update: - try: - for key, val in res.stats_update.items(): - count = int(val) - stats[key] = stats.get(key, 0) + count - except Exception as e: - _log.error(f"Skipping invalid stats_update {res.stats_update!r} for job {res.job_id!r}: {e}") - - # No valid updates: nothing to persist - if not updates: - return - - # Build update DataFrame and persist - df_updates = job_db.get_by_indices(indices=set(u["df_idx"] for u in updates)) - df_updates.update(pd.DataFrame(updates).set_index("df_idx", drop=True), overwrite=True) - job_db.persist(df_updates) - stats["job_db persist"] = stats.get("job_db persist", 0) + 1 - - def on_job_done(self, job: BatchJob, row): - """ - Handles jobs that have finished. Can be overridden to provide custom behaviour. - - Default implementation downloads the results into a folder containing the title. - - :param job: The job that has finished. - :param row: DataFrame row containing the job's metadata. - """ - # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? - - job_metadata = job.describe() - job_dir = self.get_job_dir(job.job_id) - metadata_path = self.get_job_metadata_path(job.job_id) - - self.ensure_job_dir_exists(job.job_id) - job.get_results().download_files(target=job_dir) - - with metadata_path.open("w", encoding="utf-8") as f: - json.dump(job_metadata, f, ensure_ascii=False) - - def on_job_error(self, job: BatchJob, row): - """ - Handles jobs that stopped with errors. Can be overridden to provide custom behaviour. - - Default implementation writes the error logs to a JSON file. - - :param job: The job that has finished. - :param row: DataFrame row containing the job's metadata. - """ - # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? - - error_logs = job.logs(level="error") - error_log_path = self.get_error_log_path(job.job_id) - - if len(error_logs) > 0: - self.ensure_job_dir_exists(job.job_id) - error_log_path.write_text(json.dumps(error_logs, indent=2)) - - def on_job_cancel(self, job: BatchJob, row): - """ - Handle a job that was cancelled. Can be overridden to provide custom behaviour. - - Default implementation does not do anything. - - :param job: The job that was canceled. - :param row: DataFrame row containing the job's metadata. 
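These ``on_job_*`` hooks are the intended extension points ("can be overridden to provide custom behaviour"); a short sketch of a subclass that keeps the default behaviour and adds some bookkeeping, with an illustrative subclass name and log file:

.. code-block:: python

    from pathlib import Path

    from openeo import BatchJob
    from openeo.extra.job_management import MultiBackendJobManager

    class LoggingJobManager(MultiBackendJobManager):
        def on_job_done(self, job: BatchJob, row):
            # Default behaviour: download the results and write the metadata JSON.
            super().on_job_done(job, row)
            with Path("finished_jobs.txt").open("a") as f:
                f.write(f"{job.job_id}\n")

        def on_job_error(self, job: BatchJob, row):
            # Default behaviour: dump the error logs to a JSON file.
            super().on_job_error(job, row)
            print(f"Job {job.job_id} failed, see {self.get_error_log_path(job.job_id)}")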
- """ - pass - - def _cancel_prolonged_job(self, job: BatchJob, row): - """Cancel the job if it has been running for too long.""" - try: - # Ensure running start time is valid - job_running_start_time = rfc3339.parse_datetime(row.get("running_start_time"), with_timezone=True) - - # Parse the current time into a datetime object with timezone info - current_time = rfc3339.parse_datetime(rfc3339.now_utc(), with_timezone=True) - - # Calculate the elapsed time between job start and now - elapsed = current_time - job_running_start_time - - if elapsed > self._cancel_running_job_after: - _log.info( - f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})" - ) - job.stop() - - except Exception as e: - _log.error(f"Unexpected error while handling job {job.job_id}: {e}") - - def get_job_dir(self, job_id: str) -> Path: - """Path to directory where job metadata, results and error logs are be saved.""" - return self._root_dir / f"job_{job_id}" - - def get_error_log_path(self, job_id: str) -> Path: - """Path where error log file for the job is saved.""" - return self.get_job_dir(job_id) / f"job_{job_id}_errors.json" - - def get_job_metadata_path(self, job_id: str) -> Path: - """Path where job metadata file is saved.""" - return self.get_job_dir(job_id) / f"job_{job_id}.json" - - def ensure_job_dir_exists(self, job_id: str) -> Path: - """Create the job folder if it does not exist yet.""" - job_dir = self.get_job_dir(job_id) - if not job_dir.exists(): - job_dir.mkdir(parents=True) - - def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = None) -> Tuple[List, List, List]: - """ - Tracks status (and stats) of running jobs (in place). - Optionally cancels jobs when running too long. - """ - stats = stats if stats is not None else collections.defaultdict(int) - - active = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]).copy() - - jobs_done = [] - jobs_error = [] - jobs_cancel = [] - - for i in active.index: - job_id = active.loc[i, "id"] - backend_name = active.loc[i, "backend_name"] - previous_status = active.loc[i, "status"] - - try: - con = self._get_connection(backend_name) - the_job = con.job(job_id) - job_metadata = the_job.describe() - stats["job describe"] += 1 - new_status = job_metadata["status"] - - _log.info( - f"Status of job {job_id!r} (on backend {backend_name}) is {new_status!r} (previously {previous_status!r})" - ) - - if previous_status != "finished" and new_status == "finished": - stats["job finished"] += 1 - jobs_done.append((the_job, active.loc[i])) - - if previous_status != "error" and new_status == "error": - stats["job failed"] += 1 - jobs_error.append((the_job, active.loc[i])) - - if new_status == "canceled": - stats["job canceled"] += 1 - jobs_cancel.append((the_job, active.loc[i])) - - if previous_status in {"created", "queued", "queued_for_start"} and new_status == "running": - stats["job started running"] += 1 - active.loc[i, "running_start_time"] = rfc3339.now_utc() - - if self._cancel_running_job_after and new_status == "running": - if not active.loc[i, "running_start_time"] or pd.isna(active.loc[i, "running_start_time"]): - _log.warning( - f"Unknown 'running_start_time' for running job {job_id}. Using current time as an approximation." 
- ) - stats["job started running"] += 1 - active.loc[i, "running_start_time"] = rfc3339.now_utc() - - self._cancel_prolonged_job(the_job, active.loc[i]) - - active.loc[i, "status"] = new_status - - # TODO: there is well hidden coupling here with "cpu", "memory" and "duration" from `_normalize_df` - for key in job_metadata.get("usage", {}).keys(): - if key in active.columns: - active.loc[i, key] = _format_usage_stat(job_metadata, key) - if "costs" in job_metadata.keys(): - active.loc[i, "costs"] = job_metadata.get("costs") - - except OpenEoApiError as e: - # TODO: inspect status code and e.g. differentiate between 4xx/5xx - stats["job tracking error"] += 1 - _log.warning(f"Error while tracking status of job {job_id!r} on backend {backend_name}: {e!r}") - - stats["job_db persist"] += 1 - job_db.persist(active) - - return jobs_done, jobs_error, jobs_cancel - - -def _format_usage_stat(job_metadata: dict, field: str) -> str: - value = deep_get(job_metadata, "usage", field, "value", default=0) - unit = deep_get(job_metadata, "usage", field, "unit", default="") - return f"{value} {unit}".strip() - - -@contextlib.contextmanager -def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): - """Context manager to ignore connection errors.""" - # TODO: move this out of this module and make it a more public utility? - try: - yield - except requests.exceptions.ConnectionError as e: - _log.warning(f"Ignoring connection error (context {context or 'n/a'}): {e}") - # Back off a bit - time.sleep(sleep) - - - -def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: - """ - Factory to get a job database at a given path, - guessing the database type from filename extension. - - :param path: path to job database file. - - .. versionadded:: 0.33.0 - """ - path = Path(path) - if path.suffix.lower() in {".csv"}: - job_db = CsvJobDatabase(path=path) - elif path.suffix.lower() in {".parquet", ".geoparquet"}: - job_db = ParquetJobDatabase(path=path) - else: - raise ValueError(f"Could not guess job database type from {path!r}") - return job_db - - -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): - """ - Factory to create a job database at given path, - initialized from a given dataframe, - and its database type guessed from filename extension. - - :param path: Path to the job database file. - :param df: DataFrame to store in the job database. - :param on_exists: What to do when the job database already exists: - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - .. versionadded:: 0.33.0 - """ - job_db = get_job_db(path) - if isinstance(job_db, FullDataFrameJobDatabase): - job_db.initialize_from_df(df=df, on_exists=on_exists) - else: - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db diff --git a/openeo/extra/job_management/_interface.py b/openeo/extra/job_management/_interface.py new file mode 100644 index 000000000..32f6c9d0b --- /dev/null +++ b/openeo/extra/job_management/_interface.py @@ -0,0 +1,75 @@ +import abc +from typing import ( + Callable, + Iterable, + List, + NamedTuple, + Union, +) + +import pandas as pd + +class JobDatabaseInterface(metaclass=abc.ABCMeta): + """ + Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, + allowing to regularly persist the job metadata while polling the job statuses + and resume/restart the job tracking after it was interrupted. + + .. 
versionadded:: 0.31.0 + """ + + @abc.abstractmethod + def exists(self) -> bool: + """Does the job database already exist, to read job data from?""" + ... + + @abc.abstractmethod + def persist(self, df: pd.DataFrame): + """ + Store (now or updated) job data to the database. + + The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, + so it should be merged with the existing data (if any) instead of overwriting it completely. + + :param df: job data to store. + """ + ... + + @abc.abstractmethod + def count_by_status(self, statuses: Iterable[str] = ()) -> dict: + """ + Retrieve the number of jobs per status. + + :param statuses: List/set of statuses to include. If empty, all statuses are included. + + :return: dictionary with status as key and the count as value. + """ + ... + + @abc.abstractmethod + def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: + """ + Returns a dataframe with jobs, filtered by status. + + :param statuses: List of statuses to include. + :param max: Maximum number of jobs to return. + + :return: DataFrame with jobs filtered by status. + """ + ... + + @abc.abstractmethod + def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: + """ + Returns a dataframe with jobs based on their (dataframe) index + + :param indices: List of indices to include. + + :return: DataFrame with jobs filtered by indices. + """ + ... + + + + + diff --git a/openeo/extra/job_management/_job_database.py b/openeo/extra/job_management/_job_db.py similarity index 65% rename from openeo/extra/job_management/_job_database.py rename to openeo/extra/job_management/_job_db.py index 4be3fc1eb..6248074c9 100644 --- a/openeo/extra/job_management/_job_database.py +++ b/openeo/extra/job_management/_job_db.py @@ -1,104 +1,15 @@ import abc import logging -from pathlib import Path -from typing import ( - Iterable, - Union, - List, -) - - -import pandas as pd import shapely.errors import shapely.wkt - - - -_log = logging.getLogger(__name__) - +from pathlib import Path +from typing import Iterable, Union import pandas as pd -class _ColumnProperties: - def __init__(self, dtype: str, default=None): - self.dtype = dtype - self.default = default - +from openeo.extra.job_management._interface import JobDatabaseInterface +from openeo.extra.job_management._manager import MultiBackendJobManager - -class JobDatabaseInterface(metaclass=abc.ABCMeta): - """ - Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, - allowing to regularly persist the job metadata while polling the job statuses - and resume/restart the job tracking after it was interrupted. - - .. versionadded:: 0.31.0 - """ - - @abc.abstractmethod - def exists(self) -> bool: - """Does the job database already exist, to read job data from?""" - ... - - @abc.abstractmethod - def persist(self, df: pd.DataFrame): - """ - Store (now or updated) job data to the database. - - The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, - so it should be merged with the existing data (if any) instead of overwriting it completely. - - :param df: job data to store. - """ - ... - - @abc.abstractmethod - def count_by_status(self, statuses: Iterable[str] = ()) -> dict: - """ - Retrieve the number of jobs per status. - - :param statuses: List/set of statuses to include. If empty, all statuses are included. - - :return: dictionary with status as key and the count as value. - """ - ... 
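Besides the file-based implementations further down, this interface can also be implemented directly, e.g. for tests or a non-file store; a minimal in-memory sketch (illustrative only, not part of the library):

.. code-block:: python

    from typing import Iterable, List, Union

    import pandas as pd

    from openeo.extra.job_management import JobDatabaseInterface

    class InMemoryJobDatabase(JobDatabaseInterface):
        # Toy implementation: keeps all job metadata in a single in-memory DataFrame.

        def __init__(self):
            self._df: Union[pd.DataFrame, None] = None

        def exists(self) -> bool:
            return self._df is not None

        def persist(self, df: pd.DataFrame):
            if self._df is None:
                self._df = df.copy()
            else:
                # Upsert: update known rows, append unknown ones.
                self._df.update(df, overwrite=True)
                new_rows = df.loc[df.index.difference(self._df.index)]
                self._df = pd.concat([self._df, new_rows])

        def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
            counts = self._df.groupby("status").size().to_dict()
            statuses = set(statuses)
            return {k: v for k, v in counts.items() if not statuses or k in statuses}

        def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
            selection = self._df[self._df["status"].isin(statuses)]
            return selection.head(max) if max is not None else selection

        def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
            known = set(indices).intersection(self._df.index)
            return self._df.loc[list(known)]

Note that ``run_jobs`` only calls ``initialize_from_df`` on databases that do not ``exists()`` yet, and that method lives on ``FullDataFrameJobDatabase``, so a bare interface implementation like this one would typically be pre-populated via ``persist`` before being handed to the manager.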
- - @abc.abstractmethod - def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: - """ - Returns a dataframe with jobs, filtered by status. - - :param statuses: List of statuses to include. - :param max: Maximum number of jobs to return. - - :return: DataFrame with jobs filtered by status. - """ - ... - - @abc.abstractmethod - def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: - """ - Returns a dataframe with jobs based on their (dataframe) index - - :param indices: List of indices to include. - - :return: DataFrame with jobs filtered by indices. - """ - ... - -# Expected columns in the job DB dataframes. -# TODO: make this part of public API when settled? -# TODO: move non official statuses to seperate column (not_started, queued_for_start) -COLUMN_REQUIREMENTS = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), -} +_log = logging.getLogger(__name__) class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): @@ -129,7 +40,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): else: # TODO handle other on_exists modes: e.g. overwrite, merge, ... raise ValueError(f"Invalid on_exists={on_exists!r}") - df = normalize_dataframe(df) + df = MultiBackendJobManager._normalize_df(df) self.persist(df) # Return self to allow chaining with constructor. return self @@ -187,7 +98,6 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: return self._df.loc[list(known)] - class CsvJobDatabase(FullDataFrameJobDatabase): """ Persist/load job metadata with a CSV file. @@ -223,7 +133,7 @@ def read(self) -> pd.DataFrame: df = pd.read_csv( self.path, # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in COLUMN_REQUIREMENTS.items()}, + dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, ) if ( "geometry" in df.columns @@ -242,6 +152,7 @@ def persist(self, df: pd.DataFrame): self.path.parent.mkdir(parents=True, exist_ok=True) self.df.to_csv(self.path, index=False) + class ParquetJobDatabase(FullDataFrameJobDatabase): """ Persist/load job metadata with a Parquet file. @@ -289,40 +200,8 @@ def read(self) -> pd.DataFrame: def persist(self, df: pd.DataFrame): self._merge_into_df(df) self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_parquet(self.path, index=False) - -def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. - - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. - """ - new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - return df - -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): - """ - Factory to create a job database at given path, - initialized from a given dataframe, - and its database type guessed from filename extension. + self.df.to_parquet(self.path, index=False) - :param path: Path to the job database file. 
- :param df: DataFrame to store in the job database. - :param on_exists: What to do when the job database already exists: - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - - .. versionadded:: 0.33.0 - """ - job_db = get_job_db(path) - if isinstance(job_db, FullDataFrameJobDatabase): - job_db.initialize_from_df(df=df, on_exists=on_exists) - else: - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ @@ -362,5 +241,4 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = job_db.initialize_from_df(df=df, on_exists=on_exists) else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db - + return job_db \ No newline at end of file diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py new file mode 100644 index 000000000..1c9dae4a8 --- /dev/null +++ b/openeo/extra/job_management/_manager.py @@ -0,0 +1,853 @@ +import collections +import contextlib +import dataclasses +import datetime +import json +import logging +import time +import warnings +from pathlib import Path +from threading import Thread +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + NamedTuple, + Optional, + Tuple, + Union, +) + +import pandas as pd +import requests +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +from openeo import BatchJob, Connection +from openeo.extra.job_management._thread_worker import ( + _JobManagerWorkerThreadPool, + _JobStartTask, +) +from openeo.extra.job_management._interface import JobDatabaseInterface + +from openeo.rest import OpenEoApiError +from openeo.rest.auth.auth import BearerAuth +from openeo.util import deep_get, rfc3339 + + +_log = logging.getLogger(__name__) + +MAX_RETRIES = 50 +# Sentinel value to indicate that a parameter was not set +_UNSET = object() + + +class _Backend(NamedTuple): + """Container for backend info/settings""" + + # callable to create a backend connection + get_connection: Callable[[], Connection] + # Maximum number of jobs to allow in parallel on a backend + parallel_jobs: int + +@dataclasses.dataclass(frozen=True) +class _ColumnProperties: + """Expected/required properties of a column in the job manager related dataframes""" + + dtype: str = "object" + default: Any = None + + + + +class MultiBackendJobManager: + """ + Tracker for multiple jobs on multiple backends. + + Usage example: + + .. code-block:: python + + import logging + import pandas as pd + import openeo + from openeo.extra.job_management import MultiBackendJobManager + + logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + level=logging.INFO + ) + + manager = MultiBackendJobManager() + manager.add_backend("foo", connection=openeo.connect("http://foo.test")) + manager.add_backend("bar", connection=openeo.connect("http://bar.test")) + + jobs_df = pd.DataFrame(...) + output_file = "jobs.csv" + + def start_job( + row: pd.Series, + connection: openeo.Connection, + **kwargs + ) -> openeo.BatchJob: + year = row["year"] + cube = connection.load_collection( + ..., + temporal_extent=[f"{year}-01-01", f"{year+1}-01-01"], + ) + ... + return cube.create_job(...) + + manager.run_jobs(df=jobs_df, start_job=start_job, output_file=output_file) + + See :py:meth:`.run_jobs` for more information on the ``start_job`` callable. 
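The ``connection`` argument of ``add_backend`` (documented below) also accepts a zero-argument callable, which is convenient when the connection needs authentication; a sketch with a placeholder backend URL (the authentication flow depends on the backend):

.. code-block:: python

    import openeo
    from openeo.extra.job_management import MultiBackendJobManager

    def connect_foo() -> openeo.Connection:
        # Invoked lazily by the manager (and the result is cached), so authentication
        # only happens when the backend is actually used.
        return openeo.connect("https://openeo.foo.test").authenticate_oidc()

    manager = MultiBackendJobManager(cancel_running_job_after=6 * 60 * 60)  # cancel jobs running > ~6 hours
    manager.add_backend("foo", connection=connect_foo, parallel_jobs=4)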
+ + :param poll_sleep: + How many seconds to sleep between polls. + + :param root_dir: + Root directory to save files for the jobs, e.g. metadata and error logs. + This defaults to "." the current directory. + + Each job gets its own subfolder in this root directory. + You can use the following methods to find the relevant paths, + based on the job ID: + + - get_job_dir + - get_error_log_path + - get_job_metadata_path + + :param cancel_running_job_after: + Optional temporal limit (in seconds) after which running jobs should be canceled + by the job manager. + + .. versionadded:: 0.14.0 + + .. versionchanged:: 0.32.0 + Added ``cancel_running_job_after`` parameter. + """ + + # Expected columns in the job DB dataframes. + # TODO: make this part of public API when settled? + # TODO: move non official statuses to seperate column (not_started, queued_for_start) + _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + # TODO: use proper date/time dtype instead of legacy str for start times? + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, + # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. + # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), + } + + def __init__( + self, + poll_sleep: int = 60, + root_dir: Optional[Union[str, Path]] = ".", + *, + cancel_running_job_after: Optional[int] = None, + ): + """Create a MultiBackendJobManager.""" + self._stop_thread = None + self.backends: Dict[str, _Backend] = {} + self.poll_sleep = poll_sleep + self._connections: Dict[str, _Backend] = {} + + # An explicit None or "" should also default to "." + self._root_dir = Path(root_dir or ".") + + self._cancel_running_job_after = ( + datetime.timedelta(seconds=cancel_running_job_after) if cancel_running_job_after is not None else None + ) + self._thread = None + self._worker_pool = None + # Generic cache + self._cache = {} + + def add_backend( + self, + name: str, + connection: Union[Connection, Callable[[], Connection]], + parallel_jobs: int = 2, + ): + """ + Register a backend with a name and a Connection getter. + + :param name: + Name of the backend. + :param connection: + Either a Connection to the backend, or a callable to create a backend connection. + :param parallel_jobs: + Maximum number of jobs to allow in parallel on a backend. + """ + + # TODO: Code might become simpler if we turn _Backend into class move this logic there. + # We would need to keep add_backend here as part of the public API though. 
+ # But the amount of unrelated "stuff to manage" would be less (better cohesion) + if isinstance(connection, Connection): + c = connection + connection = lambda: c + assert callable(connection) + self.backends[name] = _Backend(get_connection=connection, parallel_jobs=parallel_jobs) + + def _get_connection(self, backend_name: str, resilient: bool = True) -> Connection: + """Get a connection for the backend and optionally make it resilient (adds retry behavior) + + The default is to get a resilient connection, but if necessary you can turn it off with + resilient=False + """ + + # TODO: Code could be simplified if _Backend is a class and this method is moved there. + # TODO: Is it better to make this a public method? + + # Reuse the connection if we can, in order to avoid modifying the same connection several times. + # This is to avoid adding the retry HTTPAdapter multiple times. + # Remember that the get_connection attribute on _Backend can be a Connection object instead + # of a callable, so we don't want to assume it is a fresh connection that doesn't have the + # retry adapter yet. + if backend_name in self._connections: + return self._connections[backend_name] + + connection = self.backends[backend_name].get_connection() + # If we really need it we can skip making it resilient, but by default it should be resilient. + if resilient: + self._make_resilient(connection) + + self._connections[backend_name] = connection + return connection + + @staticmethod + def _make_resilient(connection): + """Add an HTTPAdapter that retries the request if it fails. + + Retry for the following HTTP 50x statuses: + 502 Bad Gateway + 503 Service Unavailable + 504 Gateway Timeout + """ + # TODO: migrate this to now built-in retry configuration of `Connection` or `openeo.util.http.retry_adapter`? + status_forcelist = [500, 502, 503, 504] + retries = Retry( + total=MAX_RETRIES, + read=MAX_RETRIES, + other=MAX_RETRIES, + status=MAX_RETRIES, + backoff_factor=0.1, + status_forcelist=status_forcelist, + allowed_methods=["HEAD", "GET", "OPTIONS", "POST"], + ) + connection.session.mount("https://", HTTPAdapter(max_retries=retries)) + connection.session.mount("http://", HTTPAdapter(max_retries=retries)) + + @classmethod + def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. + """ + new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + + return df + + def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): + """ + Start running the jobs in a separate thread, returns afterwards. + + :param start_job: + A callback which will be invoked with, amongst others, + the row of the dataframe for which a job should be created and/or started. + This callable should return a :py:class:`openeo.rest.job.BatchJob` object. + + The following parameters will be passed to ``start_job``: + + ``row`` (:py:class:`pandas.Series`): + The row in the pandas dataframe that stores the jobs state and other tracked data. + + ``connection_provider``: + A getter to get a connection by backend name. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. 
+ + ``connection`` (:py:class:`Connection`): + The :py:class:`Connection` itself, that has already been created. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. + + ``provider`` (``str``): + The name of the backend that will run the job. + + You do not have to define all the parameters described below, but if you leave + any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. + Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. + :param job_db: + Job database to load/store existing job status data and other metadata from/to. + Can be specified as a path to CSV or Parquet file, + or as a custom database object following the :py:class:`JobDatabaseInterface` interface. + + .. note:: + Support for Parquet files depends on the ``pyarrow`` package + as :ref:`optional dependency `. + + .. versionadded:: 0.32.0 + """ + + # Resume from existing db + _log.info(f"Resuming `run_jobs` from existing {job_db}") + + self._stop_thread = False + self._worker_pool = _JobManagerWorkerThreadPool() + + def run_loop(): + # TODO: support user-provided `stats` + stats = collections.defaultdict(int) + + while ( + sum( + job_db.count_by_status( + statuses=["not_started", "created", "queued", "queued_for_start", "running"] + ).values() + ) + > 0 + and not self._stop_thread + ): + self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) + stats["run_jobs loop"] += 1 + + # Show current stats and sleep + _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") + for _ in range(int(max(1, self.poll_sleep))): + time.sleep(1) + if self._stop_thread: + break + + self._thread = Thread(target=run_loop) + self._thread.start() + + def stop_job_thread(self, timeout_seconds: Optional[float] = _UNSET): + """ + Stop the job polling thread. + + :param timeout_seconds: The time to wait for the thread to stop. + By default, it will wait for 2 times the poll_sleep time. + Set to None to wait indefinitely. + + .. versionadded:: 0.32.0 + """ + self._worker_pool.shutdown() + + if self._thread is not None: + self._stop_thread = True + if timeout_seconds is _UNSET: + timeout_seconds = 2 * self.poll_sleep + self._thread.join(timeout_seconds) + if self._thread.is_alive(): + _log.warning("Job thread did not stop after timeout") + else: + _log.error("No job thread to stop") + + def run_jobs( + self, + df: Optional[pd.DataFrame] = None, + start_job: Callable[[], BatchJob] = _start_job_default, + job_db: Union[str, Path, JobDatabaseInterface, None] = None, + **kwargs, + ) -> dict: + """Runs jobs, specified in a dataframe, and tracks parameters. + + :param df: + DataFrame that specifies the jobs, and tracks the jobs' statuses. If None, the job_db has to be specified and will be used. + + :param start_job: + A callback which will be invoked with, amongst others, + the row of the dataframe for which a job should be created and/or started. + This callable should return a :py:class:`openeo.rest.job.BatchJob` object. + + The following parameters will be passed to ``start_job``: + + ``row`` (:py:class:`pandas.Series`): + The row in the pandas dataframe that stores the jobs state and other tracked data. + + ``connection_provider``: + A getter to get a connection by backend name. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. 
+ + ``connection`` (:py:class:`Connection`): + The :py:class:`Connection` itself, that has already been created. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. + + ``provider`` (``str``): + The name of the backend that will run the job. + + You do not have to define all the parameters described below, but if you leave + any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. + Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. + + :param job_db: + Job database to load/store existing job status data and other metadata from/to. + Can be specified as a path to CSV or Parquet file, + or as a custom database object following the :py:class:`JobDatabaseInterface` interface. + + .. note:: + Support for Parquet files depends on the ``pyarrow`` package + as :ref:`optional dependency `. + + :return: dictionary with stats collected during the job running loop. + Note that the set of fields in this dictionary is experimental + and subject to change + + .. versionchanged:: 0.31.0 + Added support for persisting the job metadata in Parquet format. + + .. versionchanged:: 0.31.0 + Replace ``output_file`` argument with ``job_db`` argument, + which can be a path to a CSV or Parquet file, + or a user-defined :py:class:`JobDatabaseInterface` object. + The deprecated ``output_file`` argument is still supported for now. + + .. versionchanged:: 0.33.0 + return a stats dictionary + """ + # TODO Defining start_jobs as a Protocol might make its usage more clear, and avoid complicated docstrings, + + # Backwards compatibility for deprecated `output_file` argument + if "output_file" in kwargs: + if job_db is not None: + raise ValueError("Only one of `output_file` and `job_db` should be provided") + warnings.warn( + "The `output_file` argument is deprecated. Use `job_db` instead.", DeprecationWarning, stacklevel=2 + ) + job_db = kwargs.pop("output_file") + assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" + + if isinstance(job_db, (str, Path)): + job_db = get_job_db(path=job_db) + + if not isinstance(job_db, JobDatabaseInterface): + raise ValueError(f"Unsupported job_db {job_db!r}") + + if job_db.exists(): + # Resume from existing db + _log.info(f"Resuming `run_jobs` from existing {job_db}") + elif df is not None: + # TODO: start showing deprecation warnings for this usage pattern? + job_db.initialize_from_df(df) + + # TODO: support user-provided `stats` + stats = collections.defaultdict(int) + + self._worker_pool = _JobManagerWorkerThreadPool() + + while ( + sum( + job_db.count_by_status( + statuses=["not_started", "created", "queued_for_start", "queued", "running"] + ).values() + ) + > 0 + ): + self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) + stats["run_jobs loop"] += 1 + + # Show current stats and sleep + _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") + time.sleep(self.poll_sleep) + stats["sleep"] += 1 + + # TODO; run post process after shutdown once more to ensure completion? + self._worker_pool.shutdown() + + return stats + + def _job_update_loop( + self, job_db: JobDatabaseInterface, start_job: Callable[[], BatchJob], stats: Optional[dict] = None + ): + """ + Inner loop logic of job management: + go through the necessary jobs to check for status updates, + trigger status events, start new jobs when there is room for them, etc. 
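The stop condition above is built on ``count_by_status``, which is also handy for ad-hoc monitoring of a job database from a separate script; a small sketch reusing the ``"jobs.csv"`` file from the usage example:

.. code-block:: python

    from openeo.extra.job_management import CsvJobDatabase

    job_db = CsvJobDatabase(path="jobs.csv")
    if job_db.exists():
        print("Overall:", job_db.count_by_status())
        print("Still active:", job_db.count_by_status(statuses=["queued", "running"]))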
+ """ + if not self.backends: + raise RuntimeError("No backends registered") + + stats = stats if stats is not None else collections.defaultdict(int) + + with ignore_connection_errors(context="get statuses"): + jobs_done, jobs_error, jobs_cancel = self._track_statuses(job_db, stats=stats) + stats["track_statuses"] += 1 + + not_started = job_db.get_by_status(statuses=["not_started"], max=200).copy() + if len(not_started) > 0: + # Check number of jobs running at each backend + # TODO: should "created" be included in here? Calling this "running" is quite misleading then. + running = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]) + stats["job_db get_by_status"] += 1 + per_backend = running.groupby("backend_name").size().to_dict() + _log.info(f"Running per backend: {per_backend}") + total_added = 0 + for backend_name in self.backends: + backend_load = per_backend.get(backend_name, 0) + if backend_load < self.backends[backend_name].parallel_jobs: + to_add = self.backends[backend_name].parallel_jobs - backend_load + for i in not_started.index[total_added : total_added + to_add]: + self._launch_job(start_job, df=not_started, i=i, backend_name=backend_name, stats=stats) + stats["job launch"] += 1 + + job_db.persist(not_started.loc[i : i + 1]) + stats["job_db persist"] += 1 + total_added += 1 + + self._process_threadworker_updates(self._worker_pool, job_db=job_db, stats=stats) + + # TODO: move this back closer to the `_track_statuses` call above, once job done/error handling is also handled in threads? + for job, row in jobs_done: + self.on_job_done(job, row) + + for job, row in jobs_error: + self.on_job_error(job, row) + + for job, row in jobs_cancel: + self.on_job_cancel(job, row) + + def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = None): + """Helper method for launching jobs + + :param start_job: + A callback which will be invoked with the row of the dataframe for which a job should be started. + This callable should return a :py:class:`openeo.rest.job.BatchJob` object. + + See also: + `MultiBackendJobManager.run_jobs` for the parameters and return type of this callable + + Even though it is called here in `_launch_job` and that is where the constraints + really come from, the public method `run_jobs` needs to document `start_job` anyway, + so let's avoid duplication in the docstrings. + + :param df: + DataFrame that specifies the jobs, and tracks the jobs' statuses. + + :param i: + index of the job's row in dataframe df + + :param backend_name: + name of the backend that will execute the job. 
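A condensed, standalone sketch of the slot-filling arithmetic in ``_job_update_loop`` above, with toy dataframes (column names as in the job database):

.. code-block:: python

    import pandas as pd

    parallel_jobs = {"foo": 2, "bar": 3}  # per-backend limits, as registered via add_backend()

    running = pd.DataFrame({"backend_name": ["foo", "foo", "bar"], "status": ["running"] * 3})
    not_started = pd.DataFrame({"status": ["not_started"] * 5})

    per_backend = running.groupby("backend_name").size().to_dict()
    total_added = 0
    for backend_name, limit in parallel_jobs.items():
        free_slots = max(limit - per_backend.get(backend_name, 0), 0)
        for i in not_started.index[total_added : total_added + free_slots]:
            print(f"would launch row {i} on backend {backend_name!r}")
            total_added += 1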
+ """ + stats = stats if stats is not None else collections.defaultdict(int) + + df.loc[i, "backend_name"] = backend_name + row = df.loc[i] + try: + _log.info(f"Starting job on backend {backend_name} for {row.to_dict()}") + connection = self._get_connection(backend_name, resilient=True) + + stats["start_job call"] += 1 + job = start_job( + row=row, + connection_provider=self._get_connection, + connection=connection, + provider=backend_name, + ) + except requests.exceptions.ConnectionError as e: + _log.warning(f"Failed to start job for {row.to_dict()}", exc_info=True) + df.loc[i, "status"] = "start_failed" + stats["start_job error"] += 1 + else: + df.loc[i, "start_time"] = rfc3339.now_utc() + if job: + df.loc[i, "id"] = job.job_id + _log.info(f"Job created: {job.job_id}") + with ignore_connection_errors(context="get status"): + status = job.status() + stats["job get status"] += 1 + df.loc[i, "status"] = status + if status == "created": + # start job if not yet done by callback + try: + job_con = job.connection + # Proactively refresh bearer token (because task in thread will not be able to do that) + self._refresh_bearer_token(connection=job_con) + task = _JobStartTask( + root_url=job_con.root_url, + bearer_token=job_con.auth.bearer if isinstance(job_con.auth, BearerAuth) else None, + job_id=job.job_id, + df_idx=i, + ) + _log.info(f"Submitting task {task} to thread pool") + self._worker_pool.submit_task(task) + + stats["job_queued_for_start"] += 1 + df.loc[i, "status"] = "queued_for_start" + except OpenEoApiError as e: + _log.info(f"Failed submitting task {task} to thread pool with error: {e}") + df.loc[i, "status"] = "queued_for_start_failed" + stats["job queued for start failed"] += 1 + else: + # TODO: what is this "skipping" about actually? + df.loc[i, "status"] = "skipped" + stats["start_job skipped"] += 1 + + def _refresh_bearer_token(self, connection: Connection, *, max_age: float = 60) -> None: + """ + Helper to proactively refresh the bearer (access) token of the connection + (but not too often, based on `max_age`). + """ + # TODO: be smarter about timing, e.g. by inspecting expiry of current token? + now = time.time() + key = f"connection:{id(connection)}:refresh-time" + if self._cache.get(key, 0) + max_age < now: + refreshed = connection.try_access_token_refresh() + if refreshed: + self._cache[key] = now + else: + _log.warning("Failed to proactively refresh bearer token") + + def _process_threadworker_updates( + self, + worker_pool: _JobManagerWorkerThreadPool, + *, + job_db: JobDatabaseInterface, + stats: Dict[str, int], + ) -> None: + """ + Fetches completed TaskResult objects from the worker pool and applies + their db_update and stats_updates. Only existing DataFrame rows + (matched by df_idx) are upserted via job_db.persist(). Any results + targeting unknown df_idx indices are logged as errors but not persisted. 
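The upsert described here comes down to a keyed ``pandas.DataFrame.update``; a tiny illustration with made-up update payloads:

.. code-block:: python

    import pandas as pd

    # Slice of the job database as returned by get_by_indices (the index plays the role of df_idx).
    df = pd.DataFrame({"id": ["j-1", "j-2"], "status": ["queued_for_start"] * 2}, index=[0, 1])

    # Updates reported by completed worker tasks.
    updates = [{"df_idx": 0, "status": "queued"}, {"df_idx": 1, "status": "error"}]

    df.update(pd.DataFrame(updates).set_index("df_idx"), overwrite=True)
    print(df)  # statuses are now "queued" and "error"; this frame would then be persisted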
+ + :param worker_pool: Thread-pool managing asynchronous Task executes + :param job_db: Interface to append/upsert to the job database + :param stats: Dictionary accumulating statistic counters + """ + # Retrieve completed task results immediately + results, _ = worker_pool.process_futures(timeout=0) + + # Collect update dicts + updates: List[Dict[str, Any]] = [] + for res in results: + # Process database updates + if res.db_update: + try: + updates.append( + { + "id": res.job_id, + "df_idx": res.df_idx, + **res.db_update, + } + ) + except Exception as e: + _log.error(f"Skipping invalid db_update {res.db_update!r} for job {res.job_id!r}: {e}") + + # Process stats updates + if res.stats_update: + try: + for key, val in res.stats_update.items(): + count = int(val) + stats[key] = stats.get(key, 0) + count + except Exception as e: + _log.error(f"Skipping invalid stats_update {res.stats_update!r} for job {res.job_id!r}: {e}") + + # No valid updates: nothing to persist + if not updates: + return + + # Build update DataFrame and persist + df_updates = job_db.get_by_indices(indices=set(u["df_idx"] for u in updates)) + df_updates.update(pd.DataFrame(updates).set_index("df_idx", drop=True), overwrite=True) + job_db.persist(df_updates) + stats["job_db persist"] = stats.get("job_db persist", 0) + 1 + + def on_job_done(self, job: BatchJob, row): + """ + Handles jobs that have finished. Can be overridden to provide custom behaviour. + + Default implementation downloads the results into a folder containing the title. + + :param job: The job that has finished. + :param row: DataFrame row containing the job's metadata. + """ + # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? + + job_metadata = job.describe() + job_dir = self.get_job_dir(job.job_id) + metadata_path = self.get_job_metadata_path(job.job_id) + + self.ensure_job_dir_exists(job.job_id) + job.get_results().download_files(target=job_dir) + + with metadata_path.open("w", encoding="utf-8") as f: + json.dump(job_metadata, f, ensure_ascii=False) + + def on_job_error(self, job: BatchJob, row): + """ + Handles jobs that stopped with errors. Can be overridden to provide custom behaviour. + + Default implementation writes the error logs to a JSON file. + + :param job: The job that has finished. + :param row: DataFrame row containing the job's metadata. + """ + # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? + + error_logs = job.logs(level="error") + error_log_path = self.get_error_log_path(job.job_id) + + if len(error_logs) > 0: + self.ensure_job_dir_exists(job.job_id) + error_log_path.write_text(json.dumps(error_logs, indent=2)) + + def on_job_cancel(self, job: BatchJob, row): + """ + Handle a job that was cancelled. Can be overridden to provide custom behaviour. + + Default implementation does not do anything. + + :param job: The job that was canceled. + :param row: DataFrame row containing the job's metadata. 
+ """ + pass + + def _cancel_prolonged_job(self, job: BatchJob, row): + """Cancel the job if it has been running for too long.""" + try: + # Ensure running start time is valid + job_running_start_time = rfc3339.parse_datetime(row.get("running_start_time"), with_timezone=True) + + # Parse the current time into a datetime object with timezone info + current_time = rfc3339.parse_datetime(rfc3339.now_utc(), with_timezone=True) + + # Calculate the elapsed time between job start and now + elapsed = current_time - job_running_start_time + + if elapsed > self._cancel_running_job_after: + _log.info( + f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})" + ) + job.stop() + + except Exception as e: + _log.error(f"Unexpected error while handling job {job.job_id}: {e}") + + def get_job_dir(self, job_id: str) -> Path: + """Path to directory where job metadata, results and error logs are be saved.""" + return self._root_dir / f"job_{job_id}" + + def get_error_log_path(self, job_id: str) -> Path: + """Path where error log file for the job is saved.""" + return self.get_job_dir(job_id) / f"job_{job_id}_errors.json" + + def get_job_metadata_path(self, job_id: str) -> Path: + """Path where job metadata file is saved.""" + return self.get_job_dir(job_id) / f"job_{job_id}.json" + + def ensure_job_dir_exists(self, job_id: str) -> Path: + """Create the job folder if it does not exist yet.""" + job_dir = self.get_job_dir(job_id) + if not job_dir.exists(): + job_dir.mkdir(parents=True) + + def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = None) -> Tuple[List, List, List]: + """ + Tracks status (and stats) of running jobs (in place). + Optionally cancels jobs when running too long. + """ + stats = stats if stats is not None else collections.defaultdict(int) + + active = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]).copy() + + jobs_done = [] + jobs_error = [] + jobs_cancel = [] + + for i in active.index: + job_id = active.loc[i, "id"] + backend_name = active.loc[i, "backend_name"] + previous_status = active.loc[i, "status"] + + try: + con = self._get_connection(backend_name) + the_job = con.job(job_id) + job_metadata = the_job.describe() + stats["job describe"] += 1 + new_status = job_metadata["status"] + + _log.info( + f"Status of job {job_id!r} (on backend {backend_name}) is {new_status!r} (previously {previous_status!r})" + ) + + if previous_status != "finished" and new_status == "finished": + stats["job finished"] += 1 + jobs_done.append((the_job, active.loc[i])) + + if previous_status != "error" and new_status == "error": + stats["job failed"] += 1 + jobs_error.append((the_job, active.loc[i])) + + if new_status == "canceled": + stats["job canceled"] += 1 + jobs_cancel.append((the_job, active.loc[i])) + + if previous_status in {"created", "queued", "queued_for_start"} and new_status == "running": + stats["job started running"] += 1 + active.loc[i, "running_start_time"] = rfc3339.now_utc() + + if self._cancel_running_job_after and new_status == "running": + if not active.loc[i, "running_start_time"] or pd.isna(active.loc[i, "running_start_time"]): + _log.warning( + f"Unknown 'running_start_time' for running job {job_id}. Using current time as an approximation." 
+ ) + stats["job started running"] += 1 + active.loc[i, "running_start_time"] = rfc3339.now_utc() + + self._cancel_prolonged_job(the_job, active.loc[i]) + + active.loc[i, "status"] = new_status + + # TODO: there is well hidden coupling here with "cpu", "memory" and "duration" from `_normalize_df` + for key in job_metadata.get("usage", {}).keys(): + if key in active.columns: + active.loc[i, key] = _format_usage_stat(job_metadata, key) + if "costs" in job_metadata.keys(): + active.loc[i, "costs"] = job_metadata.get("costs") + + except OpenEoApiError as e: + # TODO: inspect status code and e.g. differentiate between 4xx/5xx + stats["job tracking error"] += 1 + _log.warning(f"Error while tracking status of job {job_id!r} on backend {backend_name}: {e!r}") + + stats["job_db persist"] += 1 + job_db.persist(active) + + return jobs_done, jobs_error, jobs_cancel + +def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): + raise NotImplementedError("No 'start_job' callable provided") + +def _format_usage_stat(job_metadata: dict, field: str) -> str: + value = deep_get(job_metadata, "usage", field, "value", default=0) + unit = deep_get(job_metadata, "usage", field, "unit", default="") + return f"{value} {unit}".strip() + + +@contextlib.contextmanager +def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): + """Context manager to ignore connection errors.""" + # TODO: move this out of this module and make it a more public utility? + try: + yield + except requests.exceptions.ConnectionError as e: + _log.warning(f"Ignoring connection error (context {context or 'n/a'}): {e}") + # Back off a bit + time.sleep(sleep) From f635528308233908f0654bf2f6e9eb19f0474e2e Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Wed, 5 Nov 2025 15:14:53 +0100 Subject: [PATCH 07/10] splitting up text and detecting circular import --- openeo/extra/job_management/_job_db.py | 39 -- openeo/extra/job_management/_manager.py | 7 +- tests/extra/job_management/test_job_db.py | 381 +++++++++++++++++ ...test_job_management.py => test_manager.py} | 383 +----------------- 4 files changed, 396 insertions(+), 414 deletions(-) create mode 100644 tests/extra/job_management/test_job_db.py rename tests/extra/job_management/{test_job_management.py => test_manager.py} (73%) diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py index 6248074c9..c6e3521fd 100644 --- a/openeo/extra/job_management/_job_db.py +++ b/openeo/extra/job_management/_job_db.py @@ -203,42 +203,3 @@ def persist(self, df: pd.DataFrame): self.df.to_parquet(self.path, index=False) -def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: - """ - Factory to get a job database at a given path, - guessing the database type from filename extension. - - :param path: path to job database file. - - .. versionadded:: 0.33.0 - """ - path = Path(path) - if path.suffix.lower() in {".csv"}: - job_db = CsvJobDatabase(path=path) - elif path.suffix.lower() in {".parquet", ".geoparquet"}: - job_db = ParquetJobDatabase(path=path) - else: - raise ValueError(f"Could not guess job database type from {path!r}") - return job_db - - -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): - """ - Factory to create a job database at given path, - initialized from a given dataframe, - and its database type guessed from filename extension. - - :param path: Path to the job database file. - :param df: DataFrame to store in the job database. 
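As background for the job-database tests that follow, a typical round trip through the factories under test could look like this (file name and contents are illustrative; the imports assume the public re-exports from openeo.extra.job_management):

import pandas as pd
from openeo.extra.job_management import CsvJobDatabase, create_job_db, get_job_db

df = pd.DataFrame({"year": [2023, 2024]})
db = create_job_db("jobs.csv", df=df)   # database type guessed from the extension -> CsvJobDatabase
assert isinstance(db, CsvJobDatabase)
assert db.count_by_status() == {"not_started": 2}   # default columns added by normalization

db2 = get_job_db("jobs.csv")            # reopen later without re-initializing
assert db2.exists()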
- :param on_exists: What to do when the job database already exists: - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - - .. versionadded:: 0.33.0 - """ - job_db = get_job_db(path) - if isinstance(job_db, FullDataFrameJobDatabase): - job_db.initialize_from_df(df=df, on_exists=on_exists) - else: - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db \ No newline at end of file diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py index 1c9dae4a8..febb15bae 100644 --- a/openeo/extra/job_management/_manager.py +++ b/openeo/extra/job_management/_manager.py @@ -31,6 +31,7 @@ _JobStartTask, ) from openeo.extra.job_management._interface import JobDatabaseInterface +#from openeo.extra.job_management._job_db import get_job_db #TODO circular import from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth @@ -43,6 +44,8 @@ # Sentinel value to indicate that a parameter was not set _UNSET = object() +def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): + raise NotImplementedError("No 'start_job' callable provided") class _Backend(NamedTuple): """Container for backend info/settings""" @@ -434,7 +437,7 @@ def run_jobs( assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" if isinstance(job_db, (str, Path)): - job_db = get_job_db(path=job_db) + job_db = get_job_db(path=job_db) #TODO circular import if not isinstance(job_db, JobDatabaseInterface): raise ValueError(f"Unsupported job_db {job_db!r}") @@ -832,8 +835,6 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = return jobs_done, jobs_error, jobs_cancel -def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): - raise NotImplementedError("No 'start_job' callable provided") def _format_usage_stat(job_metadata: dict, field: str) -> str: value = deep_get(job_metadata, "usage", field, "value", default=0) diff --git a/tests/extra/job_management/test_job_db.py b/tests/extra/job_management/test_job_db.py new file mode 100644 index 000000000..53ccce8d5 --- /dev/null +++ b/tests/extra/job_management/test_job_db.py @@ -0,0 +1,381 @@ + +import re +import geopandas + +# TODO: can we avoid using httpretty? +# We need it for testing the resilience, which uses an HTTPadapter with Retry +# but requests-mock also uses an HTTPAdapter for the mocking and basically +# erases the HTTPAdapter we have set up. +# httpretty avoids this specific problem because it mocks at the socket level, +# But I would rather not have two dependencies with almost the same goal. 
+import pandas as pd +import pytest +import shapely.geometry + +from openeo.extra.job_management._job_db import ( + CsvJobDatabase, + ParquetJobDatabase, + create_job_db, + get_job_db, +) + +from openeo.utils.version import ComparableVersion + +JOB_DB_DF_BASICS = pd.DataFrame( + { + "numbers": [3, 2, 1], + "names": ["apple", "banana", "coconut"], + } +) +JOB_DB_GDF_WITH_GEOMETRY = geopandas.GeoDataFrame( + { + "numbers": [11, 22], + "geometry": [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 1)], + }, +) +JOB_DB_DF_WITH_GEOJSON_STRING = pd.DataFrame( + { + "numbers": [11, 22], + "geometry": ['{"type":"Point","coordinates":[1,2]}', '{"type":"Point","coordinates":[1,2]}'], + } +) + + +class TestFullDataFrameJobDatabase: + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_initialize_from_df(self, tmp_path, db_class): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.db" + + db = db_class(path) + assert not path.exists() + db.initialize_from_df(orig_df) + assert path.exists() + + # Check persisted CSV + assert path.exists() + expected_columns = { + "some_number", + "status", + "id", + "start_time", + "running_start_time", + "cpu", + "memory", + "duration", + "backend_name", + "costs", + } + + actual_columns = set(db_class(path).read().columns) + assert actual_columns == expected_columns + + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_initialize_from_df_on_exists_error(self, tmp_path, db_class): + df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.csv" + _ = db_class(path).initialize_from_df(df, on_exists="error") + assert path.exists() + + with pytest.raises(FileExistsError, match="Job database.* already exists"): + _ = db_class(path).initialize_from_df(df, on_exists="error") + + assert set(db_class(path).read()["some_number"]) == {1, 2, 3} + + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_initialize_from_df_on_exists_skip(self, tmp_path, db_class): + path = tmp_path / "jobs.db" + + db = db_class(path).initialize_from_df( + pd.DataFrame({"some_number": [3, 2, 1]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + db = db_class(path).initialize_from_df( + pd.DataFrame({"some_number": [444, 555, 666]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_count_by_status(self, tmp_path, db_class): + path = tmp_path / "jobs.db" + + db = db_class(path).initialize_from_df( + pd.DataFrame( + { + "status": [ + "not_started", + "created", + "queued", + "queued", + "queued", + "running", + "running", + "finished", + "finished", + "error", + ] + } + ) + ) + assert db.count_by_status(statuses=["not_started"]) == {"not_started": 1} + assert db.count_by_status(statuses=("not_started", "running")) == {"not_started": 1, "running": 2} + assert db.count_by_status(statuses={"finished", "error"}) == {"error": 1, "finished": 2} + + # All statuses by default + assert db.count_by_status() == { + "created": 1, + "error": 1, + "finished": 2, + "not_started": 1, + "queued": 3, + "running": 2, + } + + +class TestCsvJobDatabase: + def test_repr(self, tmp_path): + path = tmp_path / "db.csv" + db = CsvJobDatabase(path) + assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", repr(db)) + assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", str(db)) + + def test_read_wkt(self, tmp_path): + wkt_df = pd.DataFrame( + 
{ + "value": ["wkt"], + "geometry": ["POINT (30 10)"], + } + ) + path = tmp_path / "jobs.csv" + wkt_df.to_csv(path, index=False) + df = CsvJobDatabase(path).read() + assert isinstance(df.geometry[0], shapely.geometry.Point) + + def test_read_non_wkt(self, tmp_path): + non_wkt_df = pd.DataFrame( + { + "value": ["non_wkt"], + "geometry": ["this is no WKT"], + } + ) + path = tmp_path / "jobs.csv" + non_wkt_df.to_csv(path, index=False) + df = CsvJobDatabase(path).read() + assert isinstance(df.geometry[0], str) + + @pytest.mark.parametrize( + ["orig"], + [ + pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), + pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), + pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), + ], + ) + def test_persist_and_read(self, tmp_path, orig: pd.DataFrame): + path = tmp_path / "jobs.parquet" + CsvJobDatabase(path).persist(orig) + assert path.exists() + + loaded = CsvJobDatabase(path).read() + assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() + assert loaded.equals(orig) + assert type(orig) is type(loaded) + + @pytest.mark.parametrize( + ["orig"], + [ + pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), + pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), + pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), + ], + ) + def test_partial_read_write(self, tmp_path, orig: pd.DataFrame): + path = tmp_path / "jobs.csv" + + required_with_default = [ + ("status", "not_started"), + ("id", None), + ("start_time", None), + ] + new_columns = {col: val for (col, val) in required_with_default if col not in orig.columns} + orig = orig.assign(**new_columns) + + db = CsvJobDatabase(path) + db.persist(orig) + assert path.exists() + + loaded = db.get_by_status(statuses=["not_started"], max=2) + assert db.count_by_status(statuses=["not_started"])["not_started"] > 1 + + assert len(loaded) == 2 + loaded.loc[0, "status"] = "running" + loaded.loc[1, "status"] = "error" + db.persist(loaded) + assert db.count_by_status(statuses=["error"])["error"] == 1 + + all = db.read() + assert len(all) == len(orig) + assert all.loc[0, "status"] == "running" + assert all.loc[1, "status"] == "error" + if len(all) > 2: + assert all.loc[2, "status"] == "not_started" + print(loaded.index) + + def test_initialize_from_df(self, tmp_path): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.csv" + + # Initialize the CSV from the dataframe + _ = CsvJobDatabase(path).initialize_from_df(orig_df) + + # Check persisted CSV + assert path.exists() + expected_columns = { + "some_number", + "status", + "id", + "start_time", + "running_start_time", + "cpu", + "memory", + "duration", + "backend_name", + "costs", + } + + # Raw file content check + raw_columns = set(path.read_text().split("\n")[0].split(",")) + # Higher level read + read_columns = set(CsvJobDatabase(path).read().columns) + + assert raw_columns == expected_columns + assert read_columns == expected_columns + + def test_initialize_from_df_on_exists_error(self, tmp_path): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.csv" + _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") + with pytest.raises(FileExistsError, match="Job database.* already exists"): + _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") + + def test_initialize_from_df_on_exists_skip(self, tmp_path): + path = tmp_path / "jobs.csv" + + db = 
CsvJobDatabase(path).initialize_from_df( + pd.DataFrame({"some_number": [3, 2, 1]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + db = CsvJobDatabase(path).initialize_from_df( + pd.DataFrame({"some_number": [444, 555, 666]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + @pytest.mark.skipif( + ComparableVersion(geopandas.__version__) < "0.14", + reason="This issue has no workaround with geopandas < 0.14 (highest available version on Python 3.8 is 0.13.2)", + ) + def test_read_with_crs_column(self, tmp_path): + """ + Having a column named "crs" can cause obscure error messages when creating a GeoPandas dataframe + https://github.com/Open-EO/openeo-python-client/issues/714 + """ + source_df = pd.DataFrame( + { + "crs": [1234], + "geometry": ["Point(2 3)"], + } + ) + path = tmp_path / "jobs.csv" + source_df.to_csv(path, index=False) + result_df = CsvJobDatabase(path).read() + assert isinstance(result_df, geopandas.GeoDataFrame) + assert result_df.to_dict(orient="list") == { + "crs": [1234], + "geometry": [shapely.geometry.Point(2, 3)], + } + + +class TestParquetJobDatabase: + def test_repr(self, tmp_path): + path = tmp_path / "db.pq" + db = ParquetJobDatabase(path) + assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", repr(db)) + assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", str(db)) + + @pytest.mark.parametrize( + ["orig"], + [ + pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), + pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), + pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), + ], + ) + def test_persist_and_read(self, tmp_path, orig: pd.DataFrame): + path = tmp_path / "jobs.parquet" + ParquetJobDatabase(path).persist(orig) + assert path.exists() + + loaded = ParquetJobDatabase(path).read() + assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() + assert loaded.equals(orig) + assert type(orig) is type(loaded) + + def test_initialize_from_df(self, tmp_path): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.parquet" + + # Initialize the CSV from the dataframe + _ = ParquetJobDatabase(path).initialize_from_df(orig_df) + + # Check persisted CSV + assert path.exists() + expected_columns = { + "some_number", + "status", + "id", + "start_time", + "running_start_time", + "cpu", + "memory", + "duration", + "backend_name", + "costs", + } + + df_from_disk = ParquetJobDatabase(path).read() + assert set(df_from_disk.columns) == expected_columns + + +@pytest.mark.parametrize( + ["filename", "expected"], + [ + ("jobz.csv", CsvJobDatabase), + ("jobz.parquet", ParquetJobDatabase), + ], +) +def test_get_job_db(tmp_path, filename, expected): + path = tmp_path / filename + db = get_job_db(path) + assert isinstance(db, expected) + assert not path.exists() + + +@pytest.mark.parametrize( + ["filename", "expected"], + [ + ("jobz.csv", CsvJobDatabase), + ("jobz.parquet", ParquetJobDatabase), + ], +) +def test_create_job_db(tmp_path, filename, expected): + df = pd.DataFrame({"year": [2023, 2024]}) + path = tmp_path / filename + db = create_job_db(path=path, df=df) + assert isinstance(db, expected) + assert path.exists() \ No newline at end of file diff --git a/tests/extra/job_management/test_job_management.py b/tests/extra/job_management/test_manager.py similarity index 73% rename from tests/extra/job_management/test_job_management.py rename to tests/extra/job_management/test_manager.py index 58901c05a..58fcf2245 100644 --- 
a/tests/extra/job_management/test_job_management.py +++ b/tests/extra/job_management/test_manager.py @@ -11,7 +11,6 @@ from unittest import mock import dirty_equals -import geopandas # TODO: can we avoid using httpretty? # We need it for testing the resilience, which uses an HTTPadapter with Retry @@ -21,23 +20,21 @@ # But I would rather not have two dependencies with almost the same goal. import httpretty import numpy as np -import pandas import pandas as pd -import pandas.testing import pytest import requests -import shapely.geometry import openeo -import openeo.extra.job_management from openeo import BatchJob -from openeo.extra.job_management import ( +from openeo.extra.job_management._manager import ( MAX_RETRIES, - CsvJobDatabase, MultiBackendJobManager, +) + +from openeo.extra.job_management._job_db import ( + CsvJobDatabase, ParquetJobDatabase, create_job_db, - get_job_db, ) from openeo.extra.job_management._thread_worker import ( Task, @@ -47,7 +44,6 @@ from openeo.rest._testing import DummyBackend from openeo.rest.auth.testing import OidcMock from openeo.util import rfc3339 -from openeo.utils.version import ComparableVersion def _job_id_from_year(process_graph) -> Union[str, None]: @@ -608,7 +604,7 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to not actually sleep, but skip one hour at a time - with mock.patch.object(openeo.extra.job_management.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -727,7 +723,7 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to skip one hour at a time instead of actually sleeping - with mock.patch.object(openeo.extra.job_management.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -759,9 +755,9 @@ def test_process_threadworker_updates(self, tmp_path, caplog): mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats) df_final = job_db.read() - pandas.testing.assert_frame_equal( + pd.testing.assert_frame_equal( df_final[["id", "status"]], - pandas.DataFrame( + pd.DataFrame( { "id": ["j-0", "j-1", "j-2", "j-3"], "status": ["queued", "queued", "created", "created"], @@ -796,9 +792,9 @@ def test_process_threadworker_updates_unknown(self, tmp_path, caplog): mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats) df_final = job_db.read() - pandas.testing.assert_frame_equal( + pd.testing.assert_frame_equal( df_final[["id", "status"]], - pandas.DataFrame( + pd.DataFrame( { "id": ["j-123", "j-456"], "status": ["queued", "created"], @@ -908,361 +904,4 @@ def test_refresh_bearer_token_before_start( assert len(oidc_mock.grant_request_history) == 4 -JOB_DB_DF_BASICS = pd.DataFrame( - { - "numbers": [3, 2, 1], - "names": ["apple", "banana", "coconut"], - } -) -JOB_DB_GDF_WITH_GEOMETRY = geopandas.GeoDataFrame( - { - "numbers": [11, 22], - "geometry": [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 1)], - }, -) -JOB_DB_DF_WITH_GEOJSON_STRING = pd.DataFrame( - { - "numbers": [11, 22], - "geometry": 
['{"type":"Point","coordinates":[1,2]}', '{"type":"Point","coordinates":[1,2]}'], - } -) - - -class TestFullDataFrameJobDatabase: - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_initialize_from_df(self, tmp_path, db_class): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.db" - - db = db_class(path) - assert not path.exists() - db.initialize_from_df(orig_df) - assert path.exists() - - # Check persisted CSV - assert path.exists() - expected_columns = { - "some_number", - "status", - "id", - "start_time", - "running_start_time", - "cpu", - "memory", - "duration", - "backend_name", - "costs", - } - - actual_columns = set(db_class(path).read().columns) - assert actual_columns == expected_columns - - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_initialize_from_df_on_exists_error(self, tmp_path, db_class): - df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.csv" - _ = db_class(path).initialize_from_df(df, on_exists="error") - assert path.exists() - - with pytest.raises(FileExistsError, match="Job database.* already exists"): - _ = db_class(path).initialize_from_df(df, on_exists="error") - - assert set(db_class(path).read()["some_number"]) == {1, 2, 3} - - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_initialize_from_df_on_exists_skip(self, tmp_path, db_class): - path = tmp_path / "jobs.db" - - db = db_class(path).initialize_from_df( - pd.DataFrame({"some_number": [3, 2, 1]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - db = db_class(path).initialize_from_df( - pd.DataFrame({"some_number": [444, 555, 666]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_count_by_status(self, tmp_path, db_class): - path = tmp_path / "jobs.db" - - db = db_class(path).initialize_from_df( - pd.DataFrame( - { - "status": [ - "not_started", - "created", - "queued", - "queued", - "queued", - "running", - "running", - "finished", - "finished", - "error", - ] - } - ) - ) - assert db.count_by_status(statuses=["not_started"]) == {"not_started": 1} - assert db.count_by_status(statuses=("not_started", "running")) == {"not_started": 1, "running": 2} - assert db.count_by_status(statuses={"finished", "error"}) == {"error": 1, "finished": 2} - - # All statuses by default - assert db.count_by_status() == { - "created": 1, - "error": 1, - "finished": 2, - "not_started": 1, - "queued": 3, - "running": 2, - } - - -class TestCsvJobDatabase: - def test_repr(self, tmp_path): - path = tmp_path / "db.csv" - db = CsvJobDatabase(path) - assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", repr(db)) - assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", str(db)) - - def test_read_wkt(self, tmp_path): - wkt_df = pd.DataFrame( - { - "value": ["wkt"], - "geometry": ["POINT (30 10)"], - } - ) - path = tmp_path / "jobs.csv" - wkt_df.to_csv(path, index=False) - df = CsvJobDatabase(path).read() - assert isinstance(df.geometry[0], shapely.geometry.Point) - - def test_read_non_wkt(self, tmp_path): - non_wkt_df = pd.DataFrame( - { - "value": ["non_wkt"], - "geometry": ["this is no WKT"], - } - ) - path = tmp_path / "jobs.csv" - non_wkt_df.to_csv(path, index=False) - df = CsvJobDatabase(path).read() - assert isinstance(df.geometry[0], str) - - @pytest.mark.parametrize( - ["orig"], - [ - pytest.param(JOB_DB_DF_BASICS, id="pandas 
basics"), - pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), - pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), - ], - ) - def test_persist_and_read(self, tmp_path, orig: pandas.DataFrame): - path = tmp_path / "jobs.parquet" - CsvJobDatabase(path).persist(orig) - assert path.exists() - - loaded = CsvJobDatabase(path).read() - assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() - assert loaded.equals(orig) - assert type(orig) is type(loaded) - - @pytest.mark.parametrize( - ["orig"], - [ - pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), - pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), - pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), - ], - ) - def test_partial_read_write(self, tmp_path, orig: pandas.DataFrame): - path = tmp_path / "jobs.csv" - - required_with_default = [ - ("status", "not_started"), - ("id", None), - ("start_time", None), - ] - new_columns = {col: val for (col, val) in required_with_default if col not in orig.columns} - orig = orig.assign(**new_columns) - - db = CsvJobDatabase(path) - db.persist(orig) - assert path.exists() - - loaded = db.get_by_status(statuses=["not_started"], max=2) - assert db.count_by_status(statuses=["not_started"])["not_started"] > 1 - - assert len(loaded) == 2 - loaded.loc[0, "status"] = "running" - loaded.loc[1, "status"] = "error" - db.persist(loaded) - assert db.count_by_status(statuses=["error"])["error"] == 1 - - all = db.read() - assert len(all) == len(orig) - assert all.loc[0, "status"] == "running" - assert all.loc[1, "status"] == "error" - if len(all) > 2: - assert all.loc[2, "status"] == "not_started" - print(loaded.index) - - def test_initialize_from_df(self, tmp_path): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.csv" - - # Initialize the CSV from the dataframe - _ = CsvJobDatabase(path).initialize_from_df(orig_df) - - # Check persisted CSV - assert path.exists() - expected_columns = { - "some_number", - "status", - "id", - "start_time", - "running_start_time", - "cpu", - "memory", - "duration", - "backend_name", - "costs", - } - - # Raw file content check - raw_columns = set(path.read_text().split("\n")[0].split(",")) - # Higher level read - read_columns = set(CsvJobDatabase(path).read().columns) - - assert raw_columns == expected_columns - assert read_columns == expected_columns - def test_initialize_from_df_on_exists_error(self, tmp_path): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.csv" - _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") - with pytest.raises(FileExistsError, match="Job database.* already exists"): - _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") - - def test_initialize_from_df_on_exists_skip(self, tmp_path): - path = tmp_path / "jobs.csv" - - db = CsvJobDatabase(path).initialize_from_df( - pd.DataFrame({"some_number": [3, 2, 1]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - db = CsvJobDatabase(path).initialize_from_df( - pd.DataFrame({"some_number": [444, 555, 666]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - @pytest.mark.skipif( - ComparableVersion(geopandas.__version__) < "0.14", - reason="This issue has no workaround with geopandas < 0.14 (highest available version on Python 3.8 is 0.13.2)", - ) - def test_read_with_crs_column(self, tmp_path): - """ - Having a column named "crs" can cause 
obscure error messages when creating a GeoPandas dataframe - https://github.com/Open-EO/openeo-python-client/issues/714 - """ - source_df = pd.DataFrame( - { - "crs": [1234], - "geometry": ["Point(2 3)"], - } - ) - path = tmp_path / "jobs.csv" - source_df.to_csv(path, index=False) - result_df = CsvJobDatabase(path).read() - assert isinstance(result_df, geopandas.GeoDataFrame) - assert result_df.to_dict(orient="list") == { - "crs": [1234], - "geometry": [shapely.geometry.Point(2, 3)], - } - - -class TestParquetJobDatabase: - def test_repr(self, tmp_path): - path = tmp_path / "db.pq" - db = ParquetJobDatabase(path) - assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", repr(db)) - assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", str(db)) - - @pytest.mark.parametrize( - ["orig"], - [ - pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), - pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), - pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), - ], - ) - def test_persist_and_read(self, tmp_path, orig: pandas.DataFrame): - path = tmp_path / "jobs.parquet" - ParquetJobDatabase(path).persist(orig) - assert path.exists() - - loaded = ParquetJobDatabase(path).read() - assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() - assert loaded.equals(orig) - assert type(orig) is type(loaded) - - def test_initialize_from_df(self, tmp_path): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.parquet" - - # Initialize the CSV from the dataframe - _ = ParquetJobDatabase(path).initialize_from_df(orig_df) - - # Check persisted CSV - assert path.exists() - expected_columns = { - "some_number", - "status", - "id", - "start_time", - "running_start_time", - "cpu", - "memory", - "duration", - "backend_name", - "costs", - } - - df_from_disk = ParquetJobDatabase(path).read() - assert set(df_from_disk.columns) == expected_columns - - -@pytest.mark.parametrize( - ["filename", "expected"], - [ - ("jobz.csv", CsvJobDatabase), - ("jobz.parquet", ParquetJobDatabase), - ], -) -def test_get_job_db(tmp_path, filename, expected): - path = tmp_path / filename - db = get_job_db(path) - assert isinstance(db, expected) - assert not path.exists() - - -@pytest.mark.parametrize( - ["filename", "expected"], - [ - ("jobz.csv", CsvJobDatabase), - ("jobz.parquet", ParquetJobDatabase), - ], -) -def test_create_job_db(tmp_path, filename, expected): - df = pd.DataFrame({"year": [2023, 2024]}) - path = tmp_path / filename - db = create_job_db(path=path, df=df) - assert isinstance(db, expected) - assert path.exists() From 7809ca52f10b5ccf135febff7cc5e1d5e269f400 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Wed, 5 Nov 2025 15:52:54 +0100 Subject: [PATCH 08/10] avoid circular import --- openeo/extra/job_management/_df_schema.py | 42 +++++++++++++++++++++ openeo/extra/job_management/_job_db.py | 45 +++++++++++++++++++++-- openeo/extra/job_management/_manager.py | 34 ++--------------- 3 files changed, 87 insertions(+), 34 deletions(-) create mode 100644 openeo/extra/job_management/_df_schema.py diff --git a/openeo/extra/job_management/_df_schema.py b/openeo/extra/job_management/_df_schema.py new file mode 100644 index 000000000..5140a5d04 --- /dev/null +++ b/openeo/extra/job_management/_df_schema.py @@ -0,0 +1,42 @@ +import dataclasses +from typing import Any, Mapping +import pandas as pd + +@dataclasses.dataclass(frozen=True) +class _ColumnProperties: + """Expected/required properties of a column in the job manager related dataframes""" + + dtype: 
str = "object" + default: Any = None + +# Expected columns in the job DB dataframes. +# TODO: make this part of public API when settled? +# TODO: move non official statuses to seperate column (not_started, queued_for_start) +_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + # TODO: use proper date/time dtype instead of legacy str for start times? + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, + # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. + # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), +} + +def _normalize(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. + """ + new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + + return df \ No newline at end of file diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py index c6e3521fd..b0bb43ab9 100644 --- a/openeo/extra/job_management/_job_db.py +++ b/openeo/extra/job_management/_job_db.py @@ -7,7 +7,7 @@ import pandas as pd from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._manager import MultiBackendJobManager +from openeo.extra.job_management._df_schema import _normalize, _COLUMN_REQUIREMENTS _log = logging.getLogger(__name__) @@ -40,7 +40,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): else: # TODO handle other on_exists modes: e.g. overwrite, merge, ... raise ValueError(f"Invalid on_exists={on_exists!r}") - df = MultiBackendJobManager._normalize_df(df) + df = _normalize(df) self.persist(df) # Return self to allow chaining with constructor. return self @@ -133,7 +133,7 @@ def read(self) -> pd.DataFrame: df = pd.read_csv( self.path, # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, + dtype={c: r.dtype for (c, r) in _COLUMN_REQUIREMENTS.items()}, ) if ( "geometry" in df.columns @@ -203,3 +203,42 @@ def persist(self, df: pd.DataFrame): self.df.to_parquet(self.path, index=False) +def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: + """ + Factory to get a job database at a given path, + guessing the database type from filename extension. + + :param path: path to job database file. + + .. 
versionadded:: 0.33.0 + """ + path = Path(path) + if path.suffix.lower() in {".csv"}: + job_db = CsvJobDatabase(path=path) + elif path.suffix.lower() in {".parquet", ".geoparquet"}: + job_db = ParquetJobDatabase(path=path) + else: + raise ValueError(f"Could not guess job database type from {path!r}") + return job_db + + +def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): + """ + Factory to create a job database at given path, + initialized from a given dataframe, + and its database type guessed from filename extension. + + :param path: Path to the job database file. + :param df: DataFrame to store in the job database. + :param on_exists: What to do when the job database already exists: + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + + .. versionadded:: 0.33.0 + """ + job_db = get_job_db(path) + if isinstance(job_db, FullDataFrameJobDatabase): + job_db.initialize_from_df(df=df, on_exists=on_exists) + else: + raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") + return job_db \ No newline at end of file diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py index febb15bae..265ab6277 100644 --- a/openeo/extra/job_management/_manager.py +++ b/openeo/extra/job_management/_manager.py @@ -13,7 +13,6 @@ Callable, Dict, List, - Mapping, NamedTuple, Optional, Tuple, @@ -31,7 +30,8 @@ _JobStartTask, ) from openeo.extra.job_management._interface import JobDatabaseInterface -#from openeo.extra.job_management._job_db import get_job_db #TODO circular import +from openeo.extra.job_management._job_db import get_job_db +from openeo.extra.job_management._df_schema import _normalize from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth @@ -55,13 +55,6 @@ class _Backend(NamedTuple): # Maximum number of jobs to allow in parallel on a backend parallel_jobs: int -@dataclasses.dataclass(frozen=True) -class _ColumnProperties: - """Expected/required properties of a column in the job manager related dataframes""" - - dtype: str = "object" - default: Any = None - @@ -132,24 +125,6 @@ def start_job( Added ``cancel_running_job_after`` parameter. """ - # Expected columns in the job DB dataframes. - # TODO: make this part of public API when settled? - # TODO: move non official statuses to seperate column (not_started, queued_for_start) - _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - # TODO: use proper date/time dtype instead of legacy str for start times? - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, - # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. - # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), - } def __init__( self, @@ -259,10 +234,7 @@ def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: :param df: The dataframe to normalize. :return: a new dataframe that is normalized. 
""" - new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - - return df + return _normalize(df) def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): """ From fa4f7538d60ede64e4f9c7860ea000bed8ab8632 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Wed, 5 Nov 2025 16:37:19 +0100 Subject: [PATCH 09/10] ruff based code/import cleanup #741/#815 --- openeo/extra/job_management/__init__.py | 14 ++++++++----- openeo/extra/job_management/_df_schema.py | 22 ++++++++++++--------- openeo/extra/job_management/_interface.py | 14 ++----------- openeo/extra/job_management/_job_db.py | 10 ++++++---- openeo/extra/job_management/_manager.py | 21 ++++++++++---------- tests/extra/job_management/test_job_db.py | 5 ++--- tests/extra/job_management/test_manager.py | 23 +++++++++++----------- 7 files changed, 53 insertions(+), 56 deletions(-) diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index dd42ac1a8..eda8b1ff7 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -1,7 +1,13 @@ from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._job_db import FullDataFrameJobDatabase, CsvJobDatabase, ParquetJobDatabase, create_job_db, get_job_db -from openeo.extra.job_management.process_based import ProcessBasedJobCreator +from openeo.extra.job_management._job_db import ( + CsvJobDatabase, + FullDataFrameJobDatabase, + ParquetJobDatabase, + create_job_db, + get_job_db, +) from openeo.extra.job_management._manager import MultiBackendJobManager +from openeo.extra.job_management.process_based import ProcessBasedJobCreator __all__ = [ "JobDatabaseInterface", @@ -11,7 +17,5 @@ "ProcessBasedJobCreator", "create_job_db", "get_job_db", - "MultiBackendJobManager" + "MultiBackendJobManager", ] - - diff --git a/openeo/extra/job_management/_df_schema.py b/openeo/extra/job_management/_df_schema.py index 5140a5d04..a17015390 100644 --- a/openeo/extra/job_management/_df_schema.py +++ b/openeo/extra/job_management/_df_schema.py @@ -1,7 +1,9 @@ import dataclasses from typing import Any, Mapping + import pandas as pd + @dataclasses.dataclass(frozen=True) class _ColumnProperties: """Expected/required properties of a column in the job manager related dataframes""" @@ -9,6 +11,7 @@ class _ColumnProperties: dtype: str = "object" default: Any = None + # Expected columns in the job DB dataframes. # TODO: make this part of public API when settled? # TODO: move non official statuses to seperate column (not_started, queued_for_start) @@ -28,15 +31,16 @@ class _ColumnProperties: "costs": _ColumnProperties(dtype="float64"), } + def _normalize(df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. - """ - new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. 
+ """ + new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) - return df \ No newline at end of file + return df diff --git a/openeo/extra/job_management/_interface.py b/openeo/extra/job_management/_interface.py index 32f6c9d0b..215a1cd3a 100644 --- a/openeo/extra/job_management/_interface.py +++ b/openeo/extra/job_management/_interface.py @@ -1,14 +1,9 @@ import abc -from typing import ( - Callable, - Iterable, - List, - NamedTuple, - Union, -) +from typing import Iterable, List, Union import pandas as pd + class JobDatabaseInterface(metaclass=abc.ABCMeta): """ Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, @@ -68,8 +63,3 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: :return: DataFrame with jobs filtered by indices. """ ... - - - - - diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py index b0bb43ab9..a0be53443 100644 --- a/openeo/extra/job_management/_job_db.py +++ b/openeo/extra/job_management/_job_db.py @@ -1,16 +1,18 @@ import abc import logging -import shapely.errors -import shapely.wkt from pathlib import Path from typing import Iterable, Union + import pandas as pd +import shapely.errors +import shapely.wkt +from openeo.extra.job_management._df_schema import _COLUMN_REQUIREMENTS, _normalize from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._df_schema import _normalize, _COLUMN_REQUIREMENTS _log = logging.getLogger(__name__) + class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): super().__init__() @@ -241,4 +243,4 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = job_db.initialize_from_df(df=df, on_exists=on_exists) else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db \ No newline at end of file + return job_db diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py index 265ab6277..a76d3a7ff 100644 --- a/openeo/extra/job_management/_manager.py +++ b/openeo/extra/job_management/_manager.py @@ -1,6 +1,5 @@ import collections import contextlib -import dataclasses import datetime import json import logging @@ -25,28 +24,31 @@ from urllib3.util import Retry from openeo import BatchJob, Connection +from openeo.extra.job_management._df_schema import _normalize +from openeo.extra.job_management._interface import JobDatabaseInterface +from openeo.extra.job_management._job_db import get_job_db from openeo.extra.job_management._thread_worker import ( _JobManagerWorkerThreadPool, _JobStartTask, ) -from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._job_db import get_job_db -from openeo.extra.job_management._df_schema import _normalize - from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth from openeo.util import deep_get, rfc3339 - _log = logging.getLogger(__name__) + MAX_RETRIES = 50 + + # Sentinel value to indicate that a parameter was not set _UNSET = object() + def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): raise NotImplementedError("No 'start_job' callable provided") + class _Backend(NamedTuple): """Container for backend info/settings""" @@ -56,8 +58,6 @@ class _Backend(NamedTuple): parallel_jobs: int - - class MultiBackendJobManager: """ Tracker for multiple jobs on 
multiple backends. @@ -125,7 +125,6 @@ def start_job( Added ``cancel_running_job_after`` parameter. """ - def __init__( self, poll_sleep: int = 60, @@ -409,7 +408,7 @@ def run_jobs( assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" if isinstance(job_db, (str, Path)): - job_db = get_job_db(path=job_db) #TODO circular import + job_db = get_job_db(path=job_db) # TODO circular import if not isinstance(job_db, JobDatabaseInterface): raise ValueError(f"Unsupported job_db {job_db!r}") @@ -806,7 +805,7 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = job_db.persist(active) return jobs_done, jobs_error, jobs_cancel - + def _format_usage_stat(job_metadata: dict, field: str) -> str: value = deep_get(job_metadata, "usage", field, "value", default=0) diff --git a/tests/extra/job_management/test_job_db.py b/tests/extra/job_management/test_job_db.py index 53ccce8d5..d1f6c5a8f 100644 --- a/tests/extra/job_management/test_job_db.py +++ b/tests/extra/job_management/test_job_db.py @@ -1,5 +1,5 @@ - import re + import geopandas # TODO: can we avoid using httpretty? @@ -18,7 +18,6 @@ create_job_db, get_job_db, ) - from openeo.utils.version import ComparableVersion JOB_DB_DF_BASICS = pd.DataFrame( @@ -378,4 +377,4 @@ def test_create_job_db(tmp_path, filename, expected): path = tmp_path / filename db = create_job_db(path=path, df=df) assert isinstance(db, expected) - assert path.exists() \ No newline at end of file + assert path.exists() diff --git a/tests/extra/job_management/test_manager.py b/tests/extra/job_management/test_manager.py index 58fcf2245..ffc06e049 100644 --- a/tests/extra/job_management/test_manager.py +++ b/tests/extra/job_management/test_manager.py @@ -26,16 +26,15 @@ import openeo from openeo import BatchJob -from openeo.extra.job_management._manager import ( - MAX_RETRIES, - MultiBackendJobManager, -) - from openeo.extra.job_management._job_db import ( CsvJobDatabase, ParquetJobDatabase, create_job_db, ) +from openeo.extra.job_management._manager import ( + MAX_RETRIES, + MultiBackendJobManager, +) from openeo.extra.job_management._thread_worker import ( Task, _JobManagerWorkerThreadPool, @@ -604,7 +603,9 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to not actually sleep, but skip one hour at a time - with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object( + openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60) + ): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -723,7 +724,9 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to skip one hour at a time instead of actually sleeping - with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object( + openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60) + ): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -865,9 +868,8 @@ def test_refresh_bearer_token_before_start( dummy_backend_bar, job_manager_root_dir, sleep_mock, - requests_mock + requests_mock, ): - client_id = "client123" client_secret = "$3cr3t" oidc_issuer = "https://oidc.test/" @@ -902,6 +904,3 @@ def test_refresh_bearer_token_before_start( # 
Because of proactive+throttled token refreshing,
        # we should have 2 additional token requests now
        assert len(oidc_mock.grant_request_history) == 4
-
-
-

From 71971de40ec33f9629ca1b62ff41e973aa6037d1 Mon Sep 17 00:00:00 2001
From: Stefaan Lippens
Date: Wed, 5 Nov 2025 19:11:31 +0100
Subject: [PATCH 10/10] Tighter encapsulation of "column requirements" of
 MultiBackendJobManager #741/#815

Using another solution to break the import cycle
(and some doc/test tweaks along the way)
---
 openeo/extra/job_management/_df_schema.py  | 46 ------------
 openeo/extra/job_management/_interface.py  |  3 +-
 openeo/extra/job_management/_job_db.py     | 12 ++--
 openeo/extra/job_management/_manager.py    | 83 ++++++++++++++++++----
 openeo/extra/job_management/stac_job_db.py |  8 +--
 tests/extra/job_management/test_job_db.py  |  7 --
 tests/extra/job_management/test_manager.py | 15 ++--
 7 files changed, 87 insertions(+), 87 deletions(-)
 delete mode 100644 openeo/extra/job_management/_df_schema.py

diff --git a/openeo/extra/job_management/_df_schema.py b/openeo/extra/job_management/_df_schema.py
deleted file mode 100644
index a17015390..000000000
--- a/openeo/extra/job_management/_df_schema.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import dataclasses
-from typing import Any, Mapping
-
-import pandas as pd
-
-
-@dataclasses.dataclass(frozen=True)
-class _ColumnProperties:
-    """Expected/required properties of a column in the job manager related dataframes"""
-
-    dtype: str = "object"
-    default: Any = None
-
-
-# Expected columns in the job DB dataframes.
-# TODO: make this part of public API when settled?
-# TODO: move non official statuses to seperate column (not_started, queued_for_start)
-_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = {
-    "id": _ColumnProperties(dtype="str"),
-    "backend_name": _ColumnProperties(dtype="str"),
-    "status": _ColumnProperties(dtype="str", default="not_started"),
-    # TODO: use proper date/time dtype instead of legacy str for start times?
-    "start_time": _ColumnProperties(dtype="str"),
-    "running_start_time": _ColumnProperties(dtype="str"),
-    # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
-    #       but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
-    #       Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
-    "cpu": _ColumnProperties(dtype="str"),
-    "memory": _ColumnProperties(dtype="str"),
-    "duration": _ColumnProperties(dtype="str"),
-    "costs": _ColumnProperties(dtype="float64"),
-}
-
-
-def _normalize(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Normalize given pandas dataframe (creating a new one):
-    ensure we have the required columns.
-
-    :param df: The dataframe to normalize.
-    :return: a new dataframe that is normalized.
-    """
-    new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns}
-    df = df.assign(**new_columns)
-
-    return df
diff --git a/openeo/extra/job_management/_interface.py b/openeo/extra/job_management/_interface.py
index 215a1cd3a..6f7b6a9cd 100644
--- a/openeo/extra/job_management/_interface.py
+++ b/openeo/extra/job_management/_interface.py
@@ -6,7 +6,8 @@
 
 class JobDatabaseInterface(metaclass=abc.ABCMeta):
     """
-    Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`,
+    Interface for a database of job metadata to use with the
+    :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager`,
     allowing to regularly persist the job metadata while polling the job statuses
     and resume/restart the job tracking after it was interrupted.
 
diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py
index a0be53443..8e18d5c98 100644
--- a/openeo/extra/job_management/_job_db.py
+++ b/openeo/extra/job_management/_job_db.py
@@ -7,7 +7,7 @@
 import shapely.errors
 import shapely.wkt
 
-from openeo.extra.job_management._df_schema import _COLUMN_REQUIREMENTS, _normalize
+import openeo.extra.job_management._manager
 from openeo.extra.job_management._interface import JobDatabaseInterface
 
 _log = logging.getLogger(__name__)
@@ -22,7 +22,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         """
         Initialize the job database from a given dataframe,
         which will be first normalized to be compatible
-        with :py:class:`MultiBackendJobManager` usage.
+        with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager` usage.
 
         :param df: dataframe with some columns your ``start_job`` callable expects
         :param on_exists: what to do when the job database already exists (persisted on disk):
@@ -42,7 +42,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
             else:
                 # TODO handle other on_exists modes: e.g. overwrite, merge, ...
                 raise ValueError(f"Invalid on_exists={on_exists!r}")
-        df = _normalize(df)
+        df = openeo.extra.job_management._manager.MultiBackendJobManager._column_requirements.normalize_df(df)
         self.persist(df)
         # Return self to allow chaining with constructor.
         return self
@@ -104,7 +104,7 @@ class CsvJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a CSV file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     :param path: Path to local CSV file.
 
     .. note::
@@ -135,7 +135,7 @@ def read(self) -> pd.DataFrame:
         df = pd.read_csv(
             self.path,
             # TODO: possible to avoid hidden coupling with MultiBackendJobManager here?
-            dtype={c: r.dtype for (c, r) in _COLUMN_REQUIREMENTS.items()},
+            dtype=openeo.extra.job_management._manager.MultiBackendJobManager._column_requirements.dtype_mapping(),
         )
         if (
             "geometry" in df.columns
@@ -159,7 +159,7 @@ class ParquetJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a Parquet file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     :param path: Path to the Parquet file.
 
     .. note::
diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py
index a76d3a7ff..e264c8731 100644
--- a/openeo/extra/job_management/_manager.py
+++ b/openeo/extra/job_management/_manager.py
@@ -1,5 +1,6 @@
 import collections
 import contextlib
+import dataclasses
 import datetime
 import json
 import logging
@@ -12,6 +13,7 @@
     Callable,
     Dict,
     List,
+    Mapping,
     NamedTuple,
     Optional,
     Tuple,
@@ -23,10 +25,10 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
 
+# TODO avoid this (circular) dependency on _job_db?
+import openeo.extra.job_management._job_db
 from openeo import BatchJob, Connection
-from openeo.extra.job_management._df_schema import _normalize
 from openeo.extra.job_management._interface import JobDatabaseInterface
-from openeo.extra.job_management._job_db import get_job_db
 from openeo.extra.job_management._thread_worker import (
     _JobManagerWorkerThreadPool,
     _JobStartTask,
@@ -38,6 +40,7 @@
 
 _log = logging.getLogger(__name__)
 
+# TODO: eliminate this module constant (should be part of some constructor interface)
 MAX_RETRIES = 50
 
 
@@ -58,6 +61,45 @@ class _Backend(NamedTuple):
     parallel_jobs: int
 
 
+@dataclasses.dataclass(frozen=True)
+class _ColumnProperties:
+    """Expected/required properties of a column in the job manager related dataframes"""
+
+    dtype: str = "object"
+    default: Any = None
+
+
+class _ColumnRequirements:
+    """
+    Helper class to encapsulate the column requirements expected by MultiBackendJobManager.
+    The current implementation (e.g. _job_db) has some undesired coupling here,
+    but it turns out quite hard to eliminate.
+    The goal of this class is, currently, to at least make the coupling explicit
+    in a centralized way.
+    """
+
+    def __init__(self, requirements: Mapping[str, _ColumnProperties]):
+        self._requirements = dict(requirements)
+
+    def normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Normalize given pandas dataframe (creating a new one):
+        ensure we have the required columns.
+
+        :param df: The dataframe to normalize.
+        :return: a new dataframe that is normalized.
+        """
+        new_columns = {col: req.default for (col, req) in self._requirements.items() if col not in df.columns}
+        df = df.assign(**new_columns)
+        return df
+
+    def dtype_mapping(self) -> Dict[str, str]:
+        """
+        Get mapping of column name to expected dtype string, e.g. to be used with pandas.read_csv(dtype=...)
+        """
+        return {col: req.dtype for (col, req) in self._requirements.items()}
+
+
 class MultiBackendJobManager:
     """
     Tracker for multiple jobs on multiple backends.
@@ -125,6 +167,27 @@ def start_job(
         Added ``cancel_running_job_after`` parameter.
     """
 
+    # Expected columns in the job DB dataframes.
+    # TODO: make this part of public API when settled?
+    # TODO: move non official statuses to separate column (not_started, queued_for_start)
+    _column_requirements: _ColumnRequirements = _ColumnRequirements(
+        {
+            "id": _ColumnProperties(dtype="str"),
+            "backend_name": _ColumnProperties(dtype="str"),
+            "status": _ColumnProperties(dtype="str", default="not_started"),
+            # TODO: use proper date/time dtype instead of legacy str for start times?
+            "start_time": _ColumnProperties(dtype="str"),
+            "running_start_time": _ColumnProperties(dtype="str"),
+            # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
+            #       but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
+            #       Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
+            "cpu": _ColumnProperties(dtype="str"),
+            "memory": _ColumnProperties(dtype="str"),
+            "duration": _ColumnProperties(dtype="str"),
+            "costs": _ColumnProperties(dtype="float64"),
+        }
+    )
+
     def __init__(
         self,
         poll_sleep: int = 60,
@@ -227,13 +290,9 @@ def _make_resilient(connection):
     @classmethod
     def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Normalize given pandas dataframe (creating a new one):
-        ensure we have the required columns.
-
-        :param df: The dataframe to normalize.
-        :return: a new dataframe that is normalized.
+        Deprecated, but kept for backwards compatibility
         """
-        return _normalize(df)
+        return cls._column_requirements.normalize_df(df)
 
     def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface):
         """
@@ -268,7 +327,7 @@ def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabas
         :param job_db: Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` interface.
 
             .. note::
                 Support for Parquet files depends on the ``pyarrow`` package
@@ -373,7 +432,7 @@ def run_jobs(
         :param job_db: Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` interface.
 
             .. note::
                 Support for Parquet files depends on the ``pyarrow`` package
@@ -389,7 +448,7 @@ def run_jobs(
         .. versionchanged:: 0.31.0
             Replace ``output_file`` argument with ``job_db`` argument,
             which can be a path to a CSV or Parquet file,
-            or a user-defined :py:class:`JobDatabaseInterface` object.
+            or a user-defined :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` object.
             The deprecated ``output_file`` argument is still supported for now.
 
         .. versionchanged:: 0.33.0
@@ -408,7 +467,7 @@ def run_jobs(
         assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}"
 
         if isinstance(job_db, (str, Path)):
-            job_db = get_job_db(path=job_db)  # TODO circular import
+            job_db = openeo.extra.job_management._job_db.get_job_db(path=job_db)
 
         if not isinstance(job_db, JobDatabaseInterface):
             raise ValueError(f"Unsupported job_db {job_db!r}")
diff --git a/openeo/extra/job_management/stac_job_db.py b/openeo/extra/job_management/stac_job_db.py
index bcd25da12..9f1dff8f5 100644
--- a/openeo/extra/job_management/stac_job_db.py
+++ b/openeo/extra/job_management/stac_job_db.py
@@ -22,7 +22,7 @@ class STACAPIJobDatabase(JobDatabaseInterface):
 
     Unstable API, subject to change.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     """
 
     def __init__(
@@ -56,10 +56,10 @@ def exists(self) -> bool:
 
     def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Normalize the given dataframe to be compatible with :py:class:`MultiBackendJobManager`
+        Normalize the given dataframe to be compatible with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager`
         by adding the default columns and setting the index.
         """
-        df = MultiBackendJobManager._normalize_df(df)
+        df = MultiBackendJobManager._column_requirements.normalize_df(df)
         # If the user doesn't specify the item_id column, we will use the index.
         if "item_id" not in df.columns:
             df = df.reset_index(names=["item_id"])
@@ -69,7 +69,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         """
         Initialize the job database from a given dataframe,
         which will be first normalized to be compatible
-        with :py:class:`MultiBackendJobManager` usage.
+        with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager` usage.
 
         :param df: dataframe with some columns your ``start_job`` callable expects
         :param on_exists: what to do when the job database already exists (persisted on disk):
diff --git a/tests/extra/job_management/test_job_db.py b/tests/extra/job_management/test_job_db.py
index d1f6c5a8f..efd1a09e0 100644
--- a/tests/extra/job_management/test_job_db.py
+++ b/tests/extra/job_management/test_job_db.py
@@ -1,13 +1,6 @@
 import re
 
 import geopandas
-
-# TODO: can we avoid using httpretty?
-# We need it for testing the resilience, which uses an HTTPadapter with Retry
-# but requests-mock also uses an HTTPAdapter for the mocking and basically
-# erases the HTTPAdapter we have set up.
-# httpretty avoids this specific problem because it mocks at the socket level,
-# But I would rather not have two dependencies with almost the same goal.
 import pandas as pd
 import pytest
 import shapely.geometry
diff --git a/tests/extra/job_management/test_manager.py b/tests/extra/job_management/test_manager.py
index ffc06e049..ea5873f11 100644
--- a/tests/extra/job_management/test_manager.py
+++ b/tests/extra/job_management/test_manager.py
@@ -31,10 +31,7 @@
     ParquetJobDatabase,
     create_job_db,
 )
-from openeo.extra.job_management._manager import (
-    MAX_RETRIES,
-    MultiBackendJobManager,
-)
+from openeo.extra.job_management._manager import MAX_RETRIES, MultiBackendJobManager
 from openeo.extra.job_management._thread_worker import (
     Task,
     _JobManagerWorkerThreadPool,
@@ -296,7 +293,7 @@ def test_start_job_thread_basic(self, tmp_path, job_manager, job_manager_root_di
 
     def test_normalize_df(self):
        df = pd.DataFrame({"some_number": [3, 2, 1]})
-        df_normalized = MultiBackendJobManager._normalize_df(df)
+        df_normalized = MultiBackendJobManager._column_requirements.normalize_df(df)
         assert set(df_normalized.columns) == set(
             [
                 "some_number",
@@ -603,9 +600,7 @@ def get_status(job_id, current_status):
         job_db_path = tmp_path / "jobs.csv"
 
         # Mock sleep() to not actually sleep, but skip one hour at a time
-        with mock.patch.object(
-            openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)
-        ):
+        with mock.patch("time.sleep", new=lambda s: time_machine.shift(60 * 60)):
             job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
 
         final_df = CsvJobDatabase(job_db_path).read()
@@ -724,9 +719,7 @@ def get_status(job_id, current_status):
         job_db_path = tmp_path / "jobs.csv"
 
         # Mock sleep() to skip one hour at a time instead of actually sleeping
-        with mock.patch.object(
-            openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)
-        ):
+        with mock.patch("time.sleep", new=lambda s: time_machine.shift(60 * 60)):
             job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
 
         final_df = CsvJobDatabase(job_db_path).read()