From 733e77a7c1345c3ef5223040ecc3241edda0fc16 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Thu, 2 Oct 2025 15:15:16 +0200 Subject: [PATCH 01/10] moving code --- openeo/extra/job_management/__init__.py | 271 +----------------- .../extra/job_management/dataframe_job_db.py | 267 +++++++++++++++++ .../process_based_job_creator.py | 198 +++++++++++++ 3 files changed, 471 insertions(+), 265 deletions(-) create mode 100644 openeo/extra/job_management/dataframe_job_db.py create mode 100644 openeo/extra/job_management/process_based_job_creator.py diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 9dfef353c..61ada5db8 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -1,4 +1,3 @@ -import abc import collections import contextlib import dataclasses @@ -13,7 +12,6 @@ Any, Callable, Dict, - Iterable, List, Mapping, NamedTuple, @@ -24,9 +22,6 @@ import pandas as pd import requests -import shapely.errors -import shapely.geometry.base -import shapely.wkt from requests.adapters import HTTPAdapter from urllib3.util import Retry @@ -35,24 +30,22 @@ _JobManagerWorkerThreadPool, _JobStartTask, ) -from openeo.extra.job_management.process_based import ProcessBasedJobCreator +from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator +from openeo.extra.job_management.dataframe_job_db import JobDatabaseInterface, FullDataFrameJobDatabase, ParquetJobDatabase, CsvJobDatabase + from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth -from openeo.util import deep_get, rfc3339 +from openeo.util import deep_get, rfc3339 + +_log = logging.getLogger(__name__) __all__ = [ "JobDatabaseInterface", "FullDataFrameJobDatabase", "ParquetJobDatabase", "CsvJobDatabase", - "ProcessBasedJobCreator", - "create_job_db", - "get_job_db", ] -_log = logging.getLogger(__name__) - - class _Backend(NamedTuple): """Container for backend info/settings""" @@ -68,67 +61,6 @@ class _Backend(NamedTuple): _UNSET = object() -class JobDatabaseInterface(metaclass=abc.ABCMeta): - """ - Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, - allowing to regularly persist the job metadata while polling the job statuses - and resume/restart the job tracking after it was interrupted. - - .. versionadded:: 0.31.0 - """ - - @abc.abstractmethod - def exists(self) -> bool: - """Does the job database already exist, to read job data from?""" - ... - - @abc.abstractmethod - def persist(self, df: pd.DataFrame): - """ - Store (now or updated) job data to the database. - - The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, - so it should be merged with the existing data (if any) instead of overwriting it completely. - - :param df: job data to store. - """ - ... - - @abc.abstractmethod - def count_by_status(self, statuses: Iterable[str] = ()) -> dict: - """ - Retrieve the number of jobs per status. - - :param statuses: List/set of statuses to include. If empty, all statuses are included. - - :return: dictionary with status as key and the count as value. - """ - ... - - @abc.abstractmethod - def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: - """ - Returns a dataframe with jobs, filtered by status. - - :param statuses: List of statuses to include. - :param max: Maximum number of jobs to return. - - :return: DataFrame with jobs filtered by status. - """ - ... 
- - @abc.abstractmethod - def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: - """ - Returns a dataframe with jobs based on their (dataframe) index - - :param indices: List of indices to include. - - :return: DataFrame with jobs filtered by indices. - """ - ... - - def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): raise NotImplementedError("No 'start_job' callable provided") @@ -930,197 +862,6 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): time.sleep(sleep) -class FullDataFrameJobDatabase(JobDatabaseInterface): - def __init__(self): - super().__init__() - self._df = None - - def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): - """ - Initialize the job database from a given dataframe, - which will be first normalized to be compatible - with :py:class:`MultiBackendJobManager` usage. - - :param df: dataframe with some columns your ``start_job`` callable expects - :param on_exists: what to do when the job database already exists (persisted on disk): - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - - :return: initialized job database. - - .. versionadded:: 0.33.0 - """ - # TODO: option to provide custom MultiBackendJobManager subclass with custom normalize? - if self.exists(): - if on_exists == "skip": - return self - elif on_exists == "error": - raise FileExistsError(f"Job database {self!r} already exists.") - else: - # TODO handle other on_exists modes: e.g. overwrite, merge, ... - raise ValueError(f"Invalid on_exists={on_exists!r}") - df = MultiBackendJobManager._normalize_df(df) - self.persist(df) - # Return self to allow chaining with constructor. - return self - - @abc.abstractmethod - def read(self) -> pd.DataFrame: - """ - Read job data from the database as pandas DataFrame. - - :return: loaded job data. - """ - ... - - @property - def df(self) -> pd.DataFrame: - if self._df is None: - self._df = self.read() - return self._df - - def count_by_status(self, statuses: Iterable[str] = ()) -> dict: - status_histogram = self.df.groupby("status").size().to_dict() - statuses = set(statuses) - if statuses: - status_histogram = {k: v for k, v in status_histogram.items() if k in statuses} - return status_histogram - - def get_by_status(self, statuses, max=None) -> pd.DataFrame: - """ - Returns a dataframe with jobs, filtered by status. - - :param statuses: List of statuses to include. - :param max: Maximum number of jobs to return. - - :return: DataFrame with jobs filtered by status. - """ - df = self.df - filtered = df[df.status.isin(statuses)] - return filtered.head(max) if max is not None else filtered - - def _merge_into_df(self, df: pd.DataFrame): - if self._df is not None: - unknown_indices = set(df.index).difference(df.index) - if unknown_indices: - _log.warning(f"Merging DataFrame with {unknown_indices=} which will be lost.") - self._df.update(df, overwrite=True) - else: - self._df = df - - def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: - indices = set(indices) - known = indices.intersection(self.df.index) - unknown = indices.difference(self.df.index) - if unknown: - _log.warning(f"Ignoring unknown DataFrame indices {unknown}") - return self._df.loc[list(known)] - - -class CsvJobDatabase(FullDataFrameJobDatabase): - """ - Persist/load job metadata with a CSV file. - - :implements: :py:class:`JobDatabaseInterface` - :param path: Path to local CSV file. - - .. 
note:: - Support for GeoPandas dataframes depends on the ``geopandas`` package - as :ref:`optional dependency `. - - .. versionadded:: 0.31.0 - """ - - def __init__(self, path: Union[str, Path]): - super().__init__() - self.path = Path(path) - - def __repr__(self): - return f"{self.__class__.__name__}({str(self.path)!r})" - - def exists(self) -> bool: - return self.path.exists() - - def _is_valid_wkt(self, wkt: str) -> bool: - try: - shapely.wkt.loads(wkt) - return True - except shapely.errors.WKTReadingError: - return False - - def read(self) -> pd.DataFrame: - df = pd.read_csv( - self.path, - # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, - ) - if ( - "geometry" in df.columns - and df["geometry"].dtype.name != "geometry" - and self._is_valid_wkt(df["geometry"].iloc[0]) - ): - import geopandas - - # `df.to_csv()` in `persist()` has encoded geometries as WKT, so we decode that here. - df.geometry = geopandas.GeoSeries.from_wkt(df["geometry"]) - df = geopandas.GeoDataFrame(df) - return df - - def persist(self, df: pd.DataFrame): - self._merge_into_df(df) - self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_csv(self.path, index=False) - - -class ParquetJobDatabase(FullDataFrameJobDatabase): - """ - Persist/load job metadata with a Parquet file. - - :implements: :py:class:`JobDatabaseInterface` - :param path: Path to the Parquet file. - - .. note:: - Support for Parquet files depends on the ``pyarrow`` package - as :ref:`optional dependency `. - - Support for GeoPandas dataframes depends on the ``geopandas`` package - as :ref:`optional dependency `. - - .. versionadded:: 0.31.0 - """ - - def __init__(self, path: Union[str, Path]): - super().__init__() - self.path = Path(path) - - def __repr__(self): - return f"{self.__class__.__name__}({str(self.path)!r})" - - def exists(self) -> bool: - return self.path.exists() - - def read(self) -> pd.DataFrame: - # Unfortunately, a naive `pandas.read_parquet()` does not easily allow - # reconstructing geometries from a GeoPandas Parquet file. - # And vice-versa, `geopandas.read_parquet()` does not support reading - # Parquet file without geometries. - # So we have to guess which case we have. - # TODO is there a cleaner way to do this? 
- import pyarrow.parquet - - metadata = pyarrow.parquet.read_metadata(self.path) - if b"geo" in metadata.metadata: - import geopandas - - return geopandas.read_parquet(self.path) - else: - return pd.read_parquet(self.path) - - def persist(self, df: pd.DataFrame): - self._merge_into_df(df) - self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_parquet(self.path, index=False) - def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ diff --git a/openeo/extra/job_management/dataframe_job_db.py b/openeo/extra/job_management/dataframe_job_db.py new file mode 100644 index 000000000..39709cf50 --- /dev/null +++ b/openeo/extra/job_management/dataframe_job_db.py @@ -0,0 +1,267 @@ +import abc +import logging +from pathlib import Path +from typing import ( + Iterable, + List, + Union, +) + +import pandas as pd +import shapely.errors +import shapely.wkt + +_log = logging.getLogger(__name__) +from openeo.extra.job_management import MultiBackendJobManager + +class JobDatabaseInterface(metaclass=abc.ABCMeta): + """ + Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, + allowing to regularly persist the job metadata while polling the job statuses + and resume/restart the job tracking after it was interrupted. + + .. versionadded:: 0.31.0 + """ + + @abc.abstractmethod + def exists(self) -> bool: + """Does the job database already exist, to read job data from?""" + ... + + @abc.abstractmethod + def persist(self, df: pd.DataFrame): + """ + Store (now or updated) job data to the database. + + The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, + so it should be merged with the existing data (if any) instead of overwriting it completely. + + :param df: job data to store. + """ + ... + + @abc.abstractmethod + def count_by_status(self, statuses: Iterable[str] = ()) -> dict: + """ + Retrieve the number of jobs per status. + + :param statuses: List/set of statuses to include. If empty, all statuses are included. + + :return: dictionary with status as key and the count as value. + """ + ... + + @abc.abstractmethod + def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: + """ + Returns a dataframe with jobs, filtered by status. + + :param statuses: List of statuses to include. + :param max: Maximum number of jobs to return. + + :return: DataFrame with jobs filtered by status. + """ + ... + + @abc.abstractmethod + def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: + """ + Returns a dataframe with jobs based on their (dataframe) index + + :param indices: List of indices to include. + + :return: DataFrame with jobs filtered by indices. + """ + ... + +class FullDataFrameJobDatabase(JobDatabaseInterface): + def __init__(self): + super().__init__() + self._df = None + + def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): + """ + Initialize the job database from a given dataframe, + which will be first normalized to be compatible + with :py:class:`MultiBackendJobManager` usage. + + :param df: dataframe with some columns your ``start_job`` callable expects + :param on_exists: what to do when the job database already exists (persisted on disk): + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + + :return: initialized job database. + + .. versionadded:: 0.33.0 + """ + # TODO: option to provide custom MultiBackendJobManager subclass with custom normalize? 
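+        # For example (illustrative sketch; the CSV path is an assumption, not part of this module):
+        #     CsvJobDatabase("jobs.csv").initialize_from_df(df, on_exists="skip")
+        # either reuses an existing "jobs.csv" untouched, or normalizes `df` and persists it as the new database.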
+ if self.exists(): + if on_exists == "skip": + return self + elif on_exists == "error": + raise FileExistsError(f"Job database {self!r} already exists.") + else: + # TODO handle other on_exists modes: e.g. overwrite, merge, ... + raise ValueError(f"Invalid on_exists={on_exists!r}") + df = MultiBackendJobManager._normalize_df(df) + self.persist(df) + # Return self to allow chaining with constructor. + return self + + @abc.abstractmethod + def read(self) -> pd.DataFrame: + """ + Read job data from the database as pandas DataFrame. + + :return: loaded job data. + """ + ... + + @property + def df(self) -> pd.DataFrame: + if self._df is None: + self._df = self.read() + return self._df + + def count_by_status(self, statuses: Iterable[str] = ()) -> dict: + status_histogram = self.df.groupby("status").size().to_dict() + statuses = set(statuses) + if statuses: + status_histogram = {k: v for k, v in status_histogram.items() if k in statuses} + return status_histogram + + def get_by_status(self, statuses, max=None) -> pd.DataFrame: + """ + Returns a dataframe with jobs, filtered by status. + + :param statuses: List of statuses to include. + :param max: Maximum number of jobs to return. + + :return: DataFrame with jobs filtered by status. + """ + df = self.df + filtered = df[df.status.isin(statuses)] + return filtered.head(max) if max is not None else filtered + + def _merge_into_df(self, df: pd.DataFrame): + if self._df is not None: + unknown_indices = set(df.index).difference(df.index) + if unknown_indices: + _log.warning(f"Merging DataFrame with {unknown_indices=} which will be lost.") + self._df.update(df, overwrite=True) + else: + self._df = df + + def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: + indices = set(indices) + known = indices.intersection(self.df.index) + unknown = indices.difference(self.df.index) + if unknown: + _log.warning(f"Ignoring unknown DataFrame indices {unknown}") + return self._df.loc[list(known)] + + +class CsvJobDatabase(FullDataFrameJobDatabase): + """ + Persist/load job metadata with a CSV file. + + :implements: :py:class:`JobDatabaseInterface` + :param path: Path to local CSV file. + + .. note:: + Support for GeoPandas dataframes depends on the ``geopandas`` package + as :ref:`optional dependency `. + + .. versionadded:: 0.31.0 + """ + + def __init__(self, path: Union[str, Path]): + super().__init__() + self.path = Path(path) + + def __repr__(self): + return f"{self.__class__.__name__}({str(self.path)!r})" + + def exists(self) -> bool: + return self.path.exists() + + def _is_valid_wkt(self, wkt: str) -> bool: + try: + shapely.wkt.loads(wkt) + return True + except shapely.errors.WKTReadingError: + return False + + def read(self) -> pd.DataFrame: + df = pd.read_csv( + self.path, + # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? + dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, + ) + if ( + "geometry" in df.columns + and df["geometry"].dtype.name != "geometry" + and self._is_valid_wkt(df["geometry"].iloc[0]) + ): + import geopandas + + # `df.to_csv()` in `persist()` has encoded geometries as WKT, so we decode that here. 
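+            # For example, a "geometry" cell persisted as the WKT string "POINT (5.1 51.2)" (an illustrative value)
+            # becomes a shapely geometry again, provided the first row holds valid WKT as checked by `_is_valid_wkt` above.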
+ df.geometry = geopandas.GeoSeries.from_wkt(df["geometry"]) + df = geopandas.GeoDataFrame(df) + return df + + def persist(self, df: pd.DataFrame): + self._merge_into_df(df) + self.path.parent.mkdir(parents=True, exist_ok=True) + self.df.to_csv(self.path, index=False) + +class ParquetJobDatabase(FullDataFrameJobDatabase): + """ + Persist/load job metadata with a Parquet file. + + :implements: :py:class:`JobDatabaseInterface` + :param path: Path to the Parquet file. + + .. note:: + Support for Parquet files depends on the ``pyarrow`` package + as :ref:`optional dependency `. + + Support for GeoPandas dataframes depends on the ``geopandas`` package + as :ref:`optional dependency `. + + .. versionadded:: 0.31.0 + """ + + def __init__(self, path: Union[str, Path]): + super().__init__() + self.path = Path(path) + + def __repr__(self): + return f"{self.__class__.__name__}({str(self.path)!r})" + + def exists(self) -> bool: + return self.path.exists() + + def read(self) -> pd.DataFrame: + # Unfortunately, a naive `pandas.read_parquet()` does not easily allow + # reconstructing geometries from a GeoPandas Parquet file. + # And vice-versa, `geopandas.read_parquet()` does not support reading + # Parquet file without geometries. + # So we have to guess which case we have. + # TODO is there a cleaner way to do this? + import pyarrow.parquet + + metadata = pyarrow.parquet.read_metadata(self.path) + if b"geo" in metadata.metadata: + import geopandas + + return geopandas.read_parquet(self.path) + else: + return pd.read_parquet(self.path) + + def persist(self, df: pd.DataFrame): + self._merge_into_df(df) + self.path.parent.mkdir(parents=True, exist_ok=True) + self.df.to_parquet(self.path, index=False) + + diff --git a/openeo/extra/job_management/process_based_job_creator.py b/openeo/extra/job_management/process_based_job_creator.py new file mode 100644 index 000000000..a75531b6b --- /dev/null +++ b/openeo/extra/job_management/process_based_job_creator.py @@ -0,0 +1,198 @@ + +import logging +import re +from typing import ( + List, + Optional, + Union, +) +import numpy +import pandas as pd +import shapely.geometry.base + +from openeo import BatchJob, Connection +from openeo.internal.processes.parse import ( + Parameter, + Process, + parse_remote_process_definition, +) + +from openeo.util import LazyLoadCache, repr_truncate +_log = logging.getLogger(__name__) + + +class ProcessBasedJobCreator: + """ + Batch job creator + (to be used together with :py:class:`MultiBackendJobManager`) + that takes a parameterized openEO process definition + (e.g a user-defined process (UDP) or a remote openEO process definition), + and creates a batch job + for each row of the dataframe managed by the :py:class:`MultiBackendJobManager` + by filling in the process parameters with corresponding row values. + + .. seealso:: + See :ref:`job-management-with-process-based-job-creator` + for more information and examples. + + Process parameters are linked to dataframe columns by name. + While this intuitive name-based matching should cover most use cases, + there are additional options for overrides or fallbacks: + + - When provided, ``parameter_column_map`` will be consulted + for resolving a process parameter name (key in the dictionary) + to a desired dataframe column name (corresponding value). + - One common case is handled automatically as convenience functionality. 
+ + When: + + - ``parameter_column_map`` is not provided (or set to ``None``), + - and there is a *single parameter* that accepts inline GeoJSON geometries, + - and the dataframe is a GeoPandas dataframe with a *single geometry* column, + + then this parameter and this geometries column will be linked automatically. + + - If a parameter can not be matched with a column by name as described above, + a default value will be picked, + first by looking in ``parameter_defaults`` (if provided), + and then by looking up the default value from the parameter schema in the process definition. + - Finally if no (default) value can be determined and the parameter + is not flagged as optional, an error will be raised. + + + :param process_id: (optional) openEO process identifier. + Can be omitted when working with a remote process definition + that is fully defined with a URL in the ``namespace`` parameter. + :param namespace: (optional) openEO process namespace. + Typically used to provide a URL to a remote process definition. + :param parameter_defaults: (optional) default values for process parameters, + to be used when not available in the dataframe managed by + :py:class:`MultiBackendJobManager`. + :param parameter_column_map: Optional overrides + for linking process parameters to dataframe columns: + mapping of process parameter names as key + to dataframe column names as value. + + .. versionadded:: 0.33.0 + + .. warning:: + This is an experimental API subject to change, + and we greatly welcome + `feedback and suggestions for improvement `_. + + """ + + def __init__( + self, + *, + process_id: Optional[str] = None, + namespace: Union[str, None] = None, + parameter_defaults: Optional[dict] = None, + parameter_column_map: Optional[dict] = None, + ): + if process_id is None and namespace is None: + raise ValueError("At least one of `process_id` and `namespace` should be provided.") + self._process_id = process_id + self._namespace = namespace + self._parameter_defaults = parameter_defaults or {} + self._parameter_column_map = parameter_column_map + self._cache = LazyLoadCache() + + def _get_process_definition(self, connection: Connection) -> Process: + if isinstance(self._namespace, str) and re.match("https?://", self._namespace): + # Remote process definition handling + return self._cache.get( + key=("remote_process_definition", self._namespace, self._process_id), + load=lambda: parse_remote_process_definition(namespace=self._namespace, process_id=self._process_id), + ) + elif self._namespace is None: + # Handling of a user-specific UDP + udp_raw = connection.user_defined_process(self._process_id).describe() + return Process.from_dict(udp_raw) + else: + raise NotImplementedError( + f"Unsupported process definition source udp_id={self._process_id!r} namespace={self._namespace!r}" + ) + + def start_job(self, row: pd.Series, connection: Connection, **_) -> BatchJob: + """ + Implementation of the ``start_job`` callable interface + of :py:meth:`MultiBackendJobManager.run_jobs` + to create a job based on given dataframe row + + :param row: The row in the pandas dataframe that stores the jobs state and other tracked data. + :param connection: The connection to the backend. 
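+        :return: The created (but not yet started) batch job for this row.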
+ """ + # TODO: refactor out some methods, for better reuse and decoupling: + # `get_arguments()` (to build the arguments dictionary), `get_cube()` (to create the cube), + + process_definition = self._get_process_definition(connection=connection) + process_id = process_definition.id + parameters = process_definition.parameters or [] + + if self._parameter_column_map is None: + self._parameter_column_map = self._guess_parameter_column_map(parameters=parameters, row=row) + + arguments = {} + for parameter in parameters: + param_name = parameter.name + column_name = self._parameter_column_map.get(param_name, param_name) + if column_name in row.index: + # Get value from dataframe row + value = row.loc[column_name] + elif param_name in self._parameter_defaults: + # Fallback on default values from constructor + value = self._parameter_defaults[param_name] + elif parameter.has_default(): + # Explicitly use default value from parameter schema + value = parameter.default + elif parameter.optional: + # Skip optional parameters without any fallback default value + continue + else: + raise ValueError(f"Missing required parameter {param_name!r} for process {process_id!r}") + + # Prepare some values/dtypes for JSON encoding + if isinstance(value, numpy.integer): + value = int(value) + elif isinstance(value, numpy.number): + value = float(value) + elif isinstance(value, shapely.geometry.base.BaseGeometry): + value = shapely.geometry.mapping(value) + + arguments[param_name] = value + + cube = connection.datacube_from_process(process_id=process_id, namespace=self._namespace, **arguments) + + title = row.get("title", f"Process {process_id!r} with {repr_truncate(arguments)}") + description = row.get("description", f"Process {process_id!r} (namespace {self._namespace}) with {arguments}") + job = connection.create_job(cube, title=title, description=description) + + return job + + def __call__(self, *arg, **kwargs) -> BatchJob: + """Syntactic sugar for calling :py:meth:`start_job`.""" + return self.start_job(*arg, **kwargs) + + @staticmethod + def _guess_parameter_column_map(parameters: List[Parameter], row: pd.Series) -> dict: + """ + Guess parameter-column mapping from given parameter list and dataframe row + """ + parameter_column_map = {} + # Geometry based mapping: try to automatically map geometry columns to geojson parameters + geojson_parameters = [p.name for p in parameters if p.schema.accepts_geojson()] + geometry_columns = [i for (i, v) in row.items() if isinstance(v, shapely.geometry.base.BaseGeometry)] + if geojson_parameters and geometry_columns: + if len(geojson_parameters) == 1 and len(geometry_columns) == 1: + # Most common case: one geometry parameter and one geometry column: can be mapped naively + parameter_column_map[geojson_parameters[0]] = geometry_columns[0] + elif all(p in geometry_columns for p in geojson_parameters): + # Each geometry param has geometry column with same name: easy to map + parameter_column_map.update((p, p) for p in geojson_parameters) + else: + raise RuntimeError( + f"Problem with mapping geometry columns ({geometry_columns}) to process parameters ({geojson_parameters})" + ) + _log.debug(f"Guessed parameter-column map: {parameter_column_map}") + return parameter_column_map From 2e5f7d32200687662218685b7e2b16c5c63383a6 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Fri, 3 Oct 2025 14:37:25 +0200 Subject: [PATCH 02/10] remove circular dependency --- openeo/extra/job_management/__init__.py | 41 ++++------ .../extra/job_management/_dataframe_utils.py | 33 ++++++++ 
.../{dataframe_job_db.py => _job_database.py} | 77 +++++++++++++++++-- .../job_management/test_job_management.py | 2 +- 4 files changed, 120 insertions(+), 33 deletions(-) create mode 100644 openeo/extra/job_management/_dataframe_utils.py rename openeo/extra/job_management/{dataframe_job_db.py => _job_database.py} (76%) diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 61ada5db8..0a50dd955 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -13,7 +13,6 @@ Callable, Dict, List, - Mapping, NamedTuple, Optional, Tuple, @@ -31,7 +30,9 @@ _JobStartTask, ) from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator -from openeo.extra.job_management.dataframe_job_db import JobDatabaseInterface, FullDataFrameJobDatabase, ParquetJobDatabase, CsvJobDatabase +from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db +from openeo.extra.job_management._dataframe_utils import normalize_dataframe + from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth @@ -44,6 +45,10 @@ "FullDataFrameJobDatabase", "ParquetJobDatabase", "CsvJobDatabase", + "ProcessBasedJobCreator", + "create_job_db", + "get_job_db", + ] class _Backend(NamedTuple): @@ -73,6 +78,8 @@ class _ColumnProperties: default: Any = None + + class MultiBackendJobManager: """ Tracker for multiple jobs on multiple backends. @@ -143,21 +150,7 @@ def start_job( # Expected columns in the job DB dataframes. # TODO: make this part of public API when settled? # TODO: move non official statuses to seperate column (not_started, queued_for_start) - _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - # TODO: use proper date/time dtype instead of legacy str for start times? - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, - # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. - # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), - } + def __init__( self, @@ -260,17 +253,8 @@ def _make_resilient(connection): @classmethod def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. - - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. 
- """ - new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - return df + return normalize_dataframe(df) def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): """ @@ -863,6 +847,7 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): +<<<<<<< HEAD def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ Factory to get a job database at a given path, @@ -902,3 +887,5 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") return job_db +======= +>>>>>>> 4825bbcd (remove circular dependency) diff --git a/openeo/extra/job_management/_dataframe_utils.py b/openeo/extra/job_management/_dataframe_utils.py new file mode 100644 index 000000000..151390171 --- /dev/null +++ b/openeo/extra/job_management/_dataframe_utils.py @@ -0,0 +1,33 @@ +import pandas as pd + +class _ColumnProperties: + def __init__(self, dtype: str, default=None): + self.dtype = dtype + self.default = default + +# Expected columns in the job DB dataframes. +# TODO: make this part of public API when settled? +# TODO: move non official statuses to seperate column (not_started, queued_for_start) +COLUMN_REQUIREMENTS = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), +} + +def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. + """ + new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + return df \ No newline at end of file diff --git a/openeo/extra/job_management/dataframe_job_db.py b/openeo/extra/job_management/_job_database.py similarity index 76% rename from openeo/extra/job_management/dataframe_job_db.py rename to openeo/extra/job_management/_job_database.py index 39709cf50..55c00a10f 100644 --- a/openeo/extra/job_management/dataframe_job_db.py +++ b/openeo/extra/job_management/_job_database.py @@ -3,16 +3,20 @@ from pathlib import Path from typing import ( Iterable, - List, Union, + List, ) + import pandas as pd import shapely.errors import shapely.wkt +from openeo.extra.job_management._dataframe_utils import normalize_dataframe, COLUMN_REQUIREMENTS + + + _log = logging.getLogger(__name__) -from openeo.extra.job_management import MultiBackendJobManager class JobDatabaseInterface(metaclass=abc.ABCMeta): """ @@ -74,6 +78,8 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: """ ... + + class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): super().__init__() @@ -103,7 +109,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): else: # TODO handle other on_exists modes: e.g. overwrite, merge, ... 
raise ValueError(f"Invalid on_exists={on_exists!r}") - df = MultiBackendJobManager._normalize_df(df) + df = normalize_dataframe(df) self.persist(df) # Return self to allow chaining with constructor. return self @@ -161,6 +167,7 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: return self._df.loc[list(known)] + class CsvJobDatabase(FullDataFrameJobDatabase): """ Persist/load job metadata with a CSV file. @@ -196,7 +203,7 @@ def read(self) -> pd.DataFrame: df = pd.read_csv( self.path, # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, + dtype={c: r.dtype for (c, r) in COLUMN_REQUIREMENTS.items()}, ) if ( "geometry" in df.columns @@ -262,6 +269,66 @@ def read(self) -> pd.DataFrame: def persist(self, df: pd.DataFrame): self._merge_into_df(df) self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_parquet(self.path, index=False) + self.df.to_parquet(self.path, index=False) + +def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): + """ + Factory to create a job database at given path, + initialized from a given dataframe, + and its database type guessed from filename extension. + + :param path: Path to the job database file. + :param df: DataFrame to store in the job database. + :param on_exists: What to do when the job database already exists: + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + .. versionadded:: 0.33.0 + """ + job_db = get_job_db(path) + if isinstance(job_db, FullDataFrameJobDatabase): + job_db.initialize_from_df(df=df, on_exists=on_exists) + else: + raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") + return job_db + +def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: + """ + Factory to get a job database at a given path, + guessing the database type from filename extension. + + :param path: path to job database file. + + .. versionadded:: 0.33.0 + """ + path = Path(path) + if path.suffix.lower() in {".csv"}: + job_db = CsvJobDatabase(path=path) + elif path.suffix.lower() in {".parquet", ".geoparquet"}: + job_db = ParquetJobDatabase(path=path) + else: + raise ValueError(f"Could not guess job database type from {path!r}") + return job_db + + +def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): + """ + Factory to create a job database at given path, + initialized from a given dataframe, + and its database type guessed from filename extension. + + :param path: Path to the job database file. + :param df: DataFrame to store in the job database. + :param on_exists: What to do when the job database already exists: + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + + .. 
versionadded:: 0.33.0 + """ + job_db = get_job_db(path) + if isinstance(job_db, FullDataFrameJobDatabase): + job_db.initialize_from_df(df=df, on_exists=on_exists) + else: + raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") + return job_db diff --git a/tests/extra/job_management/test_job_management.py b/tests/extra/job_management/test_job_management.py index fab901548..58901c05a 100644 --- a/tests/extra/job_management/test_job_management.py +++ b/tests/extra/job_management/test_job_management.py @@ -869,7 +869,7 @@ def test_refresh_bearer_token_before_start( dummy_backend_bar, job_manager_root_dir, sleep_mock, - requests_mock, + requests_mock ): client_id = "client123" From 0a6d03c6a600de345c3ae2e85a096f8bfc1e3c60 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Fri, 3 Oct 2025 14:39:38 +0200 Subject: [PATCH 03/10] centralizing --- openeo/extra/job_management/__init__.py | 4 +- .../extra/job_management/_dataframe_utils.py | 33 ---------------- openeo/extra/job_management/_job_database.py | 38 +++++++++++++++++-- 3 files changed, 36 insertions(+), 39 deletions(-) delete mode 100644 openeo/extra/job_management/_dataframe_utils.py diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 0a50dd955..1660ae0b5 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -30,9 +30,7 @@ _JobStartTask, ) from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator -from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db -from openeo.extra.job_management._dataframe_utils import normalize_dataframe - +from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db, normalize_dataframe from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth diff --git a/openeo/extra/job_management/_dataframe_utils.py b/openeo/extra/job_management/_dataframe_utils.py deleted file mode 100644 index 151390171..000000000 --- a/openeo/extra/job_management/_dataframe_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd - -class _ColumnProperties: - def __init__(self, dtype: str, default=None): - self.dtype = dtype - self.default = default - -# Expected columns in the job DB dataframes. -# TODO: make this part of public API when settled? -# TODO: move non official statuses to seperate column (not_started, queued_for_start) -COLUMN_REQUIREMENTS = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), -} - -def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. - - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. 
- """ - new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - return df \ No newline at end of file diff --git a/openeo/extra/job_management/_job_database.py b/openeo/extra/job_management/_job_database.py index 55c00a10f..4be3fc1eb 100644 --- a/openeo/extra/job_management/_job_database.py +++ b/openeo/extra/job_management/_job_database.py @@ -12,12 +12,19 @@ import shapely.errors import shapely.wkt -from openeo.extra.job_management._dataframe_utils import normalize_dataframe, COLUMN_REQUIREMENTS - _log = logging.getLogger(__name__) +import pandas as pd + +class _ColumnProperties: + def __init__(self, dtype: str, default=None): + self.dtype = dtype + self.default = default + + + class JobDatabaseInterface(metaclass=abc.ABCMeta): """ Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, @@ -78,7 +85,20 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: """ ... - +# Expected columns in the job DB dataframes. +# TODO: make this part of public API when settled? +# TODO: move non official statuses to seperate column (not_started, queued_for_start) +COLUMN_REQUIREMENTS = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), +} class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): @@ -271,6 +291,18 @@ def persist(self, df: pd.DataFrame): self.path.parent.mkdir(parents=True, exist_ok=True) self.df.to_parquet(self.path, index=False) +def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. 
+ """ + new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + return df + def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): """ Factory to create a job database at given path, From 57458a39b05c91d4b54508b1958efd5de3f8466b Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Tue, 4 Nov 2025 13:23:22 +0100 Subject: [PATCH 04/10] rebase --- openeo/extra/job_management/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 1660ae0b5..1f70cb686 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -845,7 +845,6 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): -<<<<<<< HEAD def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ Factory to get a job database at a given path, @@ -885,5 +884,3 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") return job_db -======= ->>>>>>> 4825bbcd (remove circular dependency) From c2f1ad1b587049ef109403cca37cc7e5521b1327 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Tue, 4 Nov 2025 13:27:31 +0100 Subject: [PATCH 05/10] duplication --- .../process_based_job_creator.py | 198 ------------------ 1 file changed, 198 deletions(-) delete mode 100644 openeo/extra/job_management/process_based_job_creator.py diff --git a/openeo/extra/job_management/process_based_job_creator.py b/openeo/extra/job_management/process_based_job_creator.py deleted file mode 100644 index a75531b6b..000000000 --- a/openeo/extra/job_management/process_based_job_creator.py +++ /dev/null @@ -1,198 +0,0 @@ - -import logging -import re -from typing import ( - List, - Optional, - Union, -) -import numpy -import pandas as pd -import shapely.geometry.base - -from openeo import BatchJob, Connection -from openeo.internal.processes.parse import ( - Parameter, - Process, - parse_remote_process_definition, -) - -from openeo.util import LazyLoadCache, repr_truncate -_log = logging.getLogger(__name__) - - -class ProcessBasedJobCreator: - """ - Batch job creator - (to be used together with :py:class:`MultiBackendJobManager`) - that takes a parameterized openEO process definition - (e.g a user-defined process (UDP) or a remote openEO process definition), - and creates a batch job - for each row of the dataframe managed by the :py:class:`MultiBackendJobManager` - by filling in the process parameters with corresponding row values. - - .. seealso:: - See :ref:`job-management-with-process-based-job-creator` - for more information and examples. - - Process parameters are linked to dataframe columns by name. - While this intuitive name-based matching should cover most use cases, - there are additional options for overrides or fallbacks: - - - When provided, ``parameter_column_map`` will be consulted - for resolving a process parameter name (key in the dictionary) - to a desired dataframe column name (corresponding value). - - One common case is handled automatically as convenience functionality. 
- - When: - - - ``parameter_column_map`` is not provided (or set to ``None``), - - and there is a *single parameter* that accepts inline GeoJSON geometries, - - and the dataframe is a GeoPandas dataframe with a *single geometry* column, - - then this parameter and this geometries column will be linked automatically. - - - If a parameter can not be matched with a column by name as described above, - a default value will be picked, - first by looking in ``parameter_defaults`` (if provided), - and then by looking up the default value from the parameter schema in the process definition. - - Finally if no (default) value can be determined and the parameter - is not flagged as optional, an error will be raised. - - - :param process_id: (optional) openEO process identifier. - Can be omitted when working with a remote process definition - that is fully defined with a URL in the ``namespace`` parameter. - :param namespace: (optional) openEO process namespace. - Typically used to provide a URL to a remote process definition. - :param parameter_defaults: (optional) default values for process parameters, - to be used when not available in the dataframe managed by - :py:class:`MultiBackendJobManager`. - :param parameter_column_map: Optional overrides - for linking process parameters to dataframe columns: - mapping of process parameter names as key - to dataframe column names as value. - - .. versionadded:: 0.33.0 - - .. warning:: - This is an experimental API subject to change, - and we greatly welcome - `feedback and suggestions for improvement `_. - - """ - - def __init__( - self, - *, - process_id: Optional[str] = None, - namespace: Union[str, None] = None, - parameter_defaults: Optional[dict] = None, - parameter_column_map: Optional[dict] = None, - ): - if process_id is None and namespace is None: - raise ValueError("At least one of `process_id` and `namespace` should be provided.") - self._process_id = process_id - self._namespace = namespace - self._parameter_defaults = parameter_defaults or {} - self._parameter_column_map = parameter_column_map - self._cache = LazyLoadCache() - - def _get_process_definition(self, connection: Connection) -> Process: - if isinstance(self._namespace, str) and re.match("https?://", self._namespace): - # Remote process definition handling - return self._cache.get( - key=("remote_process_definition", self._namespace, self._process_id), - load=lambda: parse_remote_process_definition(namespace=self._namespace, process_id=self._process_id), - ) - elif self._namespace is None: - # Handling of a user-specific UDP - udp_raw = connection.user_defined_process(self._process_id).describe() - return Process.from_dict(udp_raw) - else: - raise NotImplementedError( - f"Unsupported process definition source udp_id={self._process_id!r} namespace={self._namespace!r}" - ) - - def start_job(self, row: pd.Series, connection: Connection, **_) -> BatchJob: - """ - Implementation of the ``start_job`` callable interface - of :py:meth:`MultiBackendJobManager.run_jobs` - to create a job based on given dataframe row - - :param row: The row in the pandas dataframe that stores the jobs state and other tracked data. - :param connection: The connection to the backend. 
- """ - # TODO: refactor out some methods, for better reuse and decoupling: - # `get_arguments()` (to build the arguments dictionary), `get_cube()` (to create the cube), - - process_definition = self._get_process_definition(connection=connection) - process_id = process_definition.id - parameters = process_definition.parameters or [] - - if self._parameter_column_map is None: - self._parameter_column_map = self._guess_parameter_column_map(parameters=parameters, row=row) - - arguments = {} - for parameter in parameters: - param_name = parameter.name - column_name = self._parameter_column_map.get(param_name, param_name) - if column_name in row.index: - # Get value from dataframe row - value = row.loc[column_name] - elif param_name in self._parameter_defaults: - # Fallback on default values from constructor - value = self._parameter_defaults[param_name] - elif parameter.has_default(): - # Explicitly use default value from parameter schema - value = parameter.default - elif parameter.optional: - # Skip optional parameters without any fallback default value - continue - else: - raise ValueError(f"Missing required parameter {param_name!r} for process {process_id!r}") - - # Prepare some values/dtypes for JSON encoding - if isinstance(value, numpy.integer): - value = int(value) - elif isinstance(value, numpy.number): - value = float(value) - elif isinstance(value, shapely.geometry.base.BaseGeometry): - value = shapely.geometry.mapping(value) - - arguments[param_name] = value - - cube = connection.datacube_from_process(process_id=process_id, namespace=self._namespace, **arguments) - - title = row.get("title", f"Process {process_id!r} with {repr_truncate(arguments)}") - description = row.get("description", f"Process {process_id!r} (namespace {self._namespace}) with {arguments}") - job = connection.create_job(cube, title=title, description=description) - - return job - - def __call__(self, *arg, **kwargs) -> BatchJob: - """Syntactic sugar for calling :py:meth:`start_job`.""" - return self.start_job(*arg, **kwargs) - - @staticmethod - def _guess_parameter_column_map(parameters: List[Parameter], row: pd.Series) -> dict: - """ - Guess parameter-column mapping from given parameter list and dataframe row - """ - parameter_column_map = {} - # Geometry based mapping: try to automatically map geometry columns to geojson parameters - geojson_parameters = [p.name for p in parameters if p.schema.accepts_geojson()] - geometry_columns = [i for (i, v) in row.items() if isinstance(v, shapely.geometry.base.BaseGeometry)] - if geojson_parameters and geometry_columns: - if len(geojson_parameters) == 1 and len(geometry_columns) == 1: - # Most common case: one geometry parameter and one geometry column: can be mapped naively - parameter_column_map[geojson_parameters[0]] = geometry_columns[0] - elif all(p in geometry_columns for p in geojson_parameters): - # Each geometry param has geometry column with same name: easy to map - parameter_column_map.update((p, p) for p in geojson_parameters) - else: - raise RuntimeError( - f"Problem with mapping geometry columns ({geometry_columns}) to process parameters ({geojson_parameters})" - ) - _log.debug(f"Guessed parameter-column map: {parameter_column_map}") - return parameter_column_map From 5f3211bb8103dfcc788979dc6750b8fd739aceaf Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Wed, 5 Nov 2025 14:26:00 +0100 Subject: [PATCH 06/10] splitting --- openeo/extra/job_management/__init__.py | 879 +----------------- openeo/extra/job_management/_interface.py | 75 ++ 
.../{_job_database.py => _job_db.py} | 142 +-- openeo/extra/job_management/_manager.py | 853 +++++++++++++++++ 4 files changed, 943 insertions(+), 1006 deletions(-) create mode 100644 openeo/extra/job_management/_interface.py rename openeo/extra/job_management/{_job_database.py => _job_db.py} (65%) create mode 100644 openeo/extra/job_management/_manager.py diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index 1f70cb686..dd42ac1a8 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -1,42 +1,7 @@ -import collections -import contextlib -import dataclasses -import datetime -import json -import logging -import time -import warnings -from pathlib import Path -from threading import Thread -from typing import ( - Any, - Callable, - Dict, - List, - NamedTuple, - Optional, - Tuple, - Union, -) - -import pandas as pd -import requests -from requests.adapters import HTTPAdapter -from urllib3.util import Retry - -from openeo import BatchJob, Connection -from openeo.extra.job_management._thread_worker import ( - _JobManagerWorkerThreadPool, - _JobStartTask, -) -from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator -from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db, normalize_dataframe - -from openeo.rest import OpenEoApiError -from openeo.rest.auth.auth import BearerAuth -from openeo.util import deep_get, rfc3339 - -_log = logging.getLogger(__name__) +from openeo.extra.job_management._interface import JobDatabaseInterface +from openeo.extra.job_management._job_db import FullDataFrameJobDatabase, CsvJobDatabase, ParquetJobDatabase, create_job_db, get_job_db +from openeo.extra.job_management.process_based import ProcessBasedJobCreator +from openeo.extra.job_management._manager import MultiBackendJobManager __all__ = [ "JobDatabaseInterface", @@ -46,841 +11,7 @@ "ProcessBasedJobCreator", "create_job_db", "get_job_db", - + "MultiBackendJobManager" ] -class _Backend(NamedTuple): - """Container for backend info/settings""" - - # callable to create a backend connection - get_connection: Callable[[], Connection] - # Maximum number of jobs to allow in parallel on a backend - parallel_jobs: int - - -MAX_RETRIES = 50 - -# Sentinel value to indicate that a parameter was not set -_UNSET = object() - - -def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): - raise NotImplementedError("No 'start_job' callable provided") - - -@dataclasses.dataclass(frozen=True) -class _ColumnProperties: - """Expected/required properties of a column in the job manager related dataframes""" - - dtype: str = "object" - default: Any = None - - - - -class MultiBackendJobManager: - """ - Tracker for multiple jobs on multiple backends. - - Usage example: - - .. code-block:: python - - import logging - import pandas as pd - import openeo - from openeo.extra.job_management import MultiBackendJobManager - - logging.basicConfig( - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - level=logging.INFO - ) - - manager = MultiBackendJobManager() - manager.add_backend("foo", connection=openeo.connect("http://foo.test")) - manager.add_backend("bar", connection=openeo.connect("http://bar.test")) - - jobs_df = pd.DataFrame(...) 
- output_file = "jobs.csv" - - def start_job( - row: pd.Series, - connection: openeo.Connection, - **kwargs - ) -> openeo.BatchJob: - year = row["year"] - cube = connection.load_collection( - ..., - temporal_extent=[f"{year}-01-01", f"{year+1}-01-01"], - ) - ... - return cube.create_job(...) - - manager.run_jobs(df=jobs_df, start_job=start_job, output_file=output_file) - - See :py:meth:`.run_jobs` for more information on the ``start_job`` callable. - - :param poll_sleep: - How many seconds to sleep between polls. - - :param root_dir: - Root directory to save files for the jobs, e.g. metadata and error logs. - This defaults to "." the current directory. - - Each job gets its own subfolder in this root directory. - You can use the following methods to find the relevant paths, - based on the job ID: - - - get_job_dir - - get_error_log_path - - get_job_metadata_path - - :param cancel_running_job_after: - Optional temporal limit (in seconds) after which running jobs should be canceled - by the job manager. - - .. versionadded:: 0.14.0 - - .. versionchanged:: 0.32.0 - Added ``cancel_running_job_after`` parameter. - """ - - # Expected columns in the job DB dataframes. - # TODO: make this part of public API when settled? - # TODO: move non official statuses to seperate column (not_started, queued_for_start) - - - def __init__( - self, - poll_sleep: int = 60, - root_dir: Optional[Union[str, Path]] = ".", - *, - cancel_running_job_after: Optional[int] = None, - ): - """Create a MultiBackendJobManager.""" - self._stop_thread = None - self.backends: Dict[str, _Backend] = {} - self.poll_sleep = poll_sleep - self._connections: Dict[str, _Backend] = {} - - # An explicit None or "" should also default to "." - self._root_dir = Path(root_dir or ".") - - self._cancel_running_job_after = ( - datetime.timedelta(seconds=cancel_running_job_after) if cancel_running_job_after is not None else None - ) - self._thread = None - self._worker_pool = None - # Generic cache - self._cache = {} - - def add_backend( - self, - name: str, - connection: Union[Connection, Callable[[], Connection]], - parallel_jobs: int = 2, - ): - """ - Register a backend with a name and a Connection getter. - - :param name: - Name of the backend. - :param connection: - Either a Connection to the backend, or a callable to create a backend connection. - :param parallel_jobs: - Maximum number of jobs to allow in parallel on a backend. - """ - - # TODO: Code might become simpler if we turn _Backend into class move this logic there. - # We would need to keep add_backend here as part of the public API though. - # But the amount of unrelated "stuff to manage" would be less (better cohesion) - if isinstance(connection, Connection): - c = connection - connection = lambda: c - assert callable(connection) - self.backends[name] = _Backend(get_connection=connection, parallel_jobs=parallel_jobs) - - def _get_connection(self, backend_name: str, resilient: bool = True) -> Connection: - """Get a connection for the backend and optionally make it resilient (adds retry behavior) - - The default is to get a resilient connection, but if necessary you can turn it off with - resilient=False - """ - - # TODO: Code could be simplified if _Backend is a class and this method is moved there. - # TODO: Is it better to make this a public method? - - # Reuse the connection if we can, in order to avoid modifying the same connection several times. - # This is to avoid adding the retry HTTPAdapter multiple times. 
- # Remember that the get_connection attribute on _Backend can be a Connection object instead - # of a callable, so we don't want to assume it is a fresh connection that doesn't have the - # retry adapter yet. - if backend_name in self._connections: - return self._connections[backend_name] - - connection = self.backends[backend_name].get_connection() - # If we really need it we can skip making it resilient, but by default it should be resilient. - if resilient: - self._make_resilient(connection) - - self._connections[backend_name] = connection - return connection - - @staticmethod - def _make_resilient(connection): - """Add an HTTPAdapter that retries the request if it fails. - - Retry for the following HTTP 50x statuses: - 502 Bad Gateway - 503 Service Unavailable - 504 Gateway Timeout - """ - # TODO: migrate this to now built-in retry configuration of `Connection` or `openeo.util.http.retry_adapter`? - status_forcelist = [500, 502, 503, 504] - retries = Retry( - total=MAX_RETRIES, - read=MAX_RETRIES, - other=MAX_RETRIES, - status=MAX_RETRIES, - backoff_factor=0.1, - status_forcelist=status_forcelist, - allowed_methods=["HEAD", "GET", "OPTIONS", "POST"], - ) - connection.session.mount("https://", HTTPAdapter(max_retries=retries)) - connection.session.mount("http://", HTTPAdapter(max_retries=retries)) - - @classmethod - def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: - - return normalize_dataframe(df) - - def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): - """ - Start running the jobs in a separate thread, returns afterwards. - - :param start_job: - A callback which will be invoked with, amongst others, - the row of the dataframe for which a job should be created and/or started. - This callable should return a :py:class:`openeo.rest.job.BatchJob` object. - - The following parameters will be passed to ``start_job``: - - ``row`` (:py:class:`pandas.Series`): - The row in the pandas dataframe that stores the jobs state and other tracked data. - - ``connection_provider``: - A getter to get a connection by backend name. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``connection`` (:py:class:`Connection`): - The :py:class:`Connection` itself, that has already been created. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``provider`` (``str``): - The name of the backend that will run the job. - - You do not have to define all the parameters described below, but if you leave - any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. - Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. - :param job_db: - Job database to load/store existing job status data and other metadata from/to. - Can be specified as a path to CSV or Parquet file, - or as a custom database object following the :py:class:`JobDatabaseInterface` interface. - - .. note:: - Support for Parquet files depends on the ``pyarrow`` package - as :ref:`optional dependency `. - - .. 
versionadded:: 0.32.0 - """ - - # Resume from existing db - _log.info(f"Resuming `run_jobs` from existing {job_db}") - - self._stop_thread = False - self._worker_pool = _JobManagerWorkerThreadPool() - - def run_loop(): - # TODO: support user-provided `stats` - stats = collections.defaultdict(int) - - while ( - sum( - job_db.count_by_status( - statuses=["not_started", "created", "queued", "queued_for_start", "running"] - ).values() - ) - > 0 - and not self._stop_thread - ): - self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) - stats["run_jobs loop"] += 1 - - # Show current stats and sleep - _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") - for _ in range(int(max(1, self.poll_sleep))): - time.sleep(1) - if self._stop_thread: - break - - self._thread = Thread(target=run_loop) - self._thread.start() - - def stop_job_thread(self, timeout_seconds: Optional[float] = _UNSET): - """ - Stop the job polling thread. - - :param timeout_seconds: The time to wait for the thread to stop. - By default, it will wait for 2 times the poll_sleep time. - Set to None to wait indefinitely. - - .. versionadded:: 0.32.0 - """ - self._worker_pool.shutdown() - - if self._thread is not None: - self._stop_thread = True - if timeout_seconds is _UNSET: - timeout_seconds = 2 * self.poll_sleep - self._thread.join(timeout_seconds) - if self._thread.is_alive(): - _log.warning("Job thread did not stop after timeout") - else: - _log.error("No job thread to stop") - - def run_jobs( - self, - df: Optional[pd.DataFrame] = None, - start_job: Callable[[], BatchJob] = _start_job_default, - job_db: Union[str, Path, JobDatabaseInterface, None] = None, - **kwargs, - ) -> dict: - """Runs jobs, specified in a dataframe, and tracks parameters. - - :param df: - DataFrame that specifies the jobs, and tracks the jobs' statuses. If None, the job_db has to be specified and will be used. - - :param start_job: - A callback which will be invoked with, amongst others, - the row of the dataframe for which a job should be created and/or started. - This callable should return a :py:class:`openeo.rest.job.BatchJob` object. - - The following parameters will be passed to ``start_job``: - - ``row`` (:py:class:`pandas.Series`): - The row in the pandas dataframe that stores the jobs state and other tracked data. - - ``connection_provider``: - A getter to get a connection by backend name. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``connection`` (:py:class:`Connection`): - The :py:class:`Connection` itself, that has already been created. - Typically, you would need either the parameter ``connection_provider``, - or the parameter ``connection``, but likely you will not need both. - - ``provider`` (``str``): - The name of the backend that will run the job. - - You do not have to define all the parameters described below, but if you leave - any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. - Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. - - :param job_db: - Job database to load/store existing job status data and other metadata from/to. - Can be specified as a path to CSV or Parquet file, - or as a custom database object following the :py:class:`JobDatabaseInterface` interface. - - .. note:: - Support for Parquet files depends on the ``pyarrow`` package - as :ref:`optional dependency `. 
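A minimal sketch of the ``job_db`` usage described here, replacing the deprecated ``output_file`` from the class example above; the backend URL and collection id are placeholders:

.. code-block:: python

    import openeo
    import pandas as pd
    from openeo.extra.job_management import MultiBackendJobManager

    # One row per job; extra columns (here "year") reach start_job via `row`.
    jobs_df = pd.DataFrame({"year": [2020, 2021, 2022]})

    def start_job(row: pd.Series, connection: openeo.Connection, **kwargs) -> openeo.BatchJob:
        # `**kwargs` absorbs the other arguments (connection_provider, provider, ...).
        cube = connection.load_collection(
            "SENTINEL2_L2A",  # placeholder collection id
            temporal_extent=[f"{row['year']}-01-01", f"{row['year'] + 1}-01-01"],
        )
        return cube.create_job(title=f"job-{row['year']}")

    manager = MultiBackendJobManager(poll_sleep=30)
    manager.add_backend("foo", connection=openeo.connect("https://openeo.foo.test"), parallel_jobs=2)

    # A ".csv" path is resolved to a CsvJobDatabase (use a ".parquet" path for ParquetJobDatabase).
    stats = manager.run_jobs(df=jobs_df, start_job=start_job, job_db="jobs.csv")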
- - :return: dictionary with stats collected during the job running loop. - Note that the set of fields in this dictionary is experimental - and subject to change - - .. versionchanged:: 0.31.0 - Added support for persisting the job metadata in Parquet format. - - .. versionchanged:: 0.31.0 - Replace ``output_file`` argument with ``job_db`` argument, - which can be a path to a CSV or Parquet file, - or a user-defined :py:class:`JobDatabaseInterface` object. - The deprecated ``output_file`` argument is still supported for now. - - .. versionchanged:: 0.33.0 - return a stats dictionary - """ - # TODO Defining start_jobs as a Protocol might make its usage more clear, and avoid complicated docstrings, - - # Backwards compatibility for deprecated `output_file` argument - if "output_file" in kwargs: - if job_db is not None: - raise ValueError("Only one of `output_file` and `job_db` should be provided") - warnings.warn( - "The `output_file` argument is deprecated. Use `job_db` instead.", DeprecationWarning, stacklevel=2 - ) - job_db = kwargs.pop("output_file") - assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" - - if isinstance(job_db, (str, Path)): - job_db = get_job_db(path=job_db) - - if not isinstance(job_db, JobDatabaseInterface): - raise ValueError(f"Unsupported job_db {job_db!r}") - - if job_db.exists(): - # Resume from existing db - _log.info(f"Resuming `run_jobs` from existing {job_db}") - elif df is not None: - # TODO: start showing deprecation warnings for this usage pattern? - job_db.initialize_from_df(df) - - # TODO: support user-provided `stats` - stats = collections.defaultdict(int) - - self._worker_pool = _JobManagerWorkerThreadPool() - - while ( - sum( - job_db.count_by_status( - statuses=["not_started", "created", "queued_for_start", "queued", "running"] - ).values() - ) - > 0 - ): - self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) - stats["run_jobs loop"] += 1 - - # Show current stats and sleep - _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") - time.sleep(self.poll_sleep) - stats["sleep"] += 1 - - # TODO; run post process after shutdown once more to ensure completion? - self._worker_pool.shutdown() - - return stats - - def _job_update_loop( - self, job_db: JobDatabaseInterface, start_job: Callable[[], BatchJob], stats: Optional[dict] = None - ): - """ - Inner loop logic of job management: - go through the necessary jobs to check for status updates, - trigger status events, start new jobs when there is room for them, etc. - """ - if not self.backends: - raise RuntimeError("No backends registered") - - stats = stats if stats is not None else collections.defaultdict(int) - - with ignore_connection_errors(context="get statuses"): - jobs_done, jobs_error, jobs_cancel = self._track_statuses(job_db, stats=stats) - stats["track_statuses"] += 1 - - not_started = job_db.get_by_status(statuses=["not_started"], max=200).copy() - if len(not_started) > 0: - # Check number of jobs running at each backend - # TODO: should "created" be included in here? Calling this "running" is quite misleading then. 
- running = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]) - stats["job_db get_by_status"] += 1 - per_backend = running.groupby("backend_name").size().to_dict() - _log.info(f"Running per backend: {per_backend}") - total_added = 0 - for backend_name in self.backends: - backend_load = per_backend.get(backend_name, 0) - if backend_load < self.backends[backend_name].parallel_jobs: - to_add = self.backends[backend_name].parallel_jobs - backend_load - for i in not_started.index[total_added : total_added + to_add]: - self._launch_job(start_job, df=not_started, i=i, backend_name=backend_name, stats=stats) - stats["job launch"] += 1 - - job_db.persist(not_started.loc[i : i + 1]) - stats["job_db persist"] += 1 - total_added += 1 - - self._process_threadworker_updates(self._worker_pool, job_db=job_db, stats=stats) - - # TODO: move this back closer to the `_track_statuses` call above, once job done/error handling is also handled in threads? - for job, row in jobs_done: - self.on_job_done(job, row) - - for job, row in jobs_error: - self.on_job_error(job, row) - - for job, row in jobs_cancel: - self.on_job_cancel(job, row) - - def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = None): - """Helper method for launching jobs - - :param start_job: - A callback which will be invoked with the row of the dataframe for which a job should be started. - This callable should return a :py:class:`openeo.rest.job.BatchJob` object. - - See also: - `MultiBackendJobManager.run_jobs` for the parameters and return type of this callable - - Even though it is called here in `_launch_job` and that is where the constraints - really come from, the public method `run_jobs` needs to document `start_job` anyway, - so let's avoid duplication in the docstrings. - - :param df: - DataFrame that specifies the jobs, and tracks the jobs' statuses. - - :param i: - index of the job's row in dataframe df - - :param backend_name: - name of the backend that will execute the job. 
- """ - stats = stats if stats is not None else collections.defaultdict(int) - - df.loc[i, "backend_name"] = backend_name - row = df.loc[i] - try: - _log.info(f"Starting job on backend {backend_name} for {row.to_dict()}") - connection = self._get_connection(backend_name, resilient=True) - - stats["start_job call"] += 1 - job = start_job( - row=row, - connection_provider=self._get_connection, - connection=connection, - provider=backend_name, - ) - except requests.exceptions.ConnectionError as e: - _log.warning(f"Failed to start job for {row.to_dict()}", exc_info=True) - df.loc[i, "status"] = "start_failed" - stats["start_job error"] += 1 - else: - df.loc[i, "start_time"] = rfc3339.now_utc() - if job: - df.loc[i, "id"] = job.job_id - _log.info(f"Job created: {job.job_id}") - with ignore_connection_errors(context="get status"): - status = job.status() - stats["job get status"] += 1 - df.loc[i, "status"] = status - if status == "created": - # start job if not yet done by callback - try: - job_con = job.connection - # Proactively refresh bearer token (because task in thread will not be able to do that) - self._refresh_bearer_token(connection=job_con) - task = _JobStartTask( - root_url=job_con.root_url, - bearer_token=job_con.auth.bearer if isinstance(job_con.auth, BearerAuth) else None, - job_id=job.job_id, - df_idx=i, - ) - _log.info(f"Submitting task {task} to thread pool") - self._worker_pool.submit_task(task) - - stats["job_queued_for_start"] += 1 - df.loc[i, "status"] = "queued_for_start" - except OpenEoApiError as e: - _log.info(f"Failed submitting task {task} to thread pool with error: {e}") - df.loc[i, "status"] = "queued_for_start_failed" - stats["job queued for start failed"] += 1 - else: - # TODO: what is this "skipping" about actually? - df.loc[i, "status"] = "skipped" - stats["start_job skipped"] += 1 - - def _refresh_bearer_token(self, connection: Connection, *, max_age: float = 60) -> None: - """ - Helper to proactively refresh the bearer (access) token of the connection - (but not too often, based on `max_age`). - """ - # TODO: be smarter about timing, e.g. by inspecting expiry of current token? - now = time.time() - key = f"connection:{id(connection)}:refresh-time" - if self._cache.get(key, 0) + max_age < now: - refreshed = connection.try_access_token_refresh() - if refreshed: - self._cache[key] = now - else: - _log.warning("Failed to proactively refresh bearer token") - - def _process_threadworker_updates( - self, - worker_pool: _JobManagerWorkerThreadPool, - *, - job_db: JobDatabaseInterface, - stats: Dict[str, int], - ) -> None: - """ - Fetches completed TaskResult objects from the worker pool and applies - their db_update and stats_updates. Only existing DataFrame rows - (matched by df_idx) are upserted via job_db.persist(). Any results - targeting unknown df_idx indices are logged as errors but not persisted. 
- - :param worker_pool: Thread-pool managing asynchronous Task executes - :param job_db: Interface to append/upsert to the job database - :param stats: Dictionary accumulating statistic counters - """ - # Retrieve completed task results immediately - results, _ = worker_pool.process_futures(timeout=0) - - # Collect update dicts - updates: List[Dict[str, Any]] = [] - for res in results: - # Process database updates - if res.db_update: - try: - updates.append( - { - "id": res.job_id, - "df_idx": res.df_idx, - **res.db_update, - } - ) - except Exception as e: - _log.error(f"Skipping invalid db_update {res.db_update!r} for job {res.job_id!r}: {e}") - - # Process stats updates - if res.stats_update: - try: - for key, val in res.stats_update.items(): - count = int(val) - stats[key] = stats.get(key, 0) + count - except Exception as e: - _log.error(f"Skipping invalid stats_update {res.stats_update!r} for job {res.job_id!r}: {e}") - - # No valid updates: nothing to persist - if not updates: - return - - # Build update DataFrame and persist - df_updates = job_db.get_by_indices(indices=set(u["df_idx"] for u in updates)) - df_updates.update(pd.DataFrame(updates).set_index("df_idx", drop=True), overwrite=True) - job_db.persist(df_updates) - stats["job_db persist"] = stats.get("job_db persist", 0) + 1 - - def on_job_done(self, job: BatchJob, row): - """ - Handles jobs that have finished. Can be overridden to provide custom behaviour. - - Default implementation downloads the results into a folder containing the title. - - :param job: The job that has finished. - :param row: DataFrame row containing the job's metadata. - """ - # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? - - job_metadata = job.describe() - job_dir = self.get_job_dir(job.job_id) - metadata_path = self.get_job_metadata_path(job.job_id) - - self.ensure_job_dir_exists(job.job_id) - job.get_results().download_files(target=job_dir) - - with metadata_path.open("w", encoding="utf-8") as f: - json.dump(job_metadata, f, ensure_ascii=False) - - def on_job_error(self, job: BatchJob, row): - """ - Handles jobs that stopped with errors. Can be overridden to provide custom behaviour. - - Default implementation writes the error logs to a JSON file. - - :param job: The job that has finished. - :param row: DataFrame row containing the job's metadata. - """ - # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? - - error_logs = job.logs(level="error") - error_log_path = self.get_error_log_path(job.job_id) - - if len(error_logs) > 0: - self.ensure_job_dir_exists(job.job_id) - error_log_path.write_text(json.dumps(error_logs, indent=2)) - - def on_job_cancel(self, job: BatchJob, row): - """ - Handle a job that was cancelled. Can be overridden to provide custom behaviour. - - Default implementation does not do anything. - - :param job: The job that was canceled. - :param row: DataFrame row containing the job's metadata. 
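These ``on_job_*`` hooks are the intended extension points ("can be overridden to provide custom behaviour"); a short sketch of a subclass that keeps the default behaviour and adds some bookkeeping, with an illustrative subclass name and log file:

.. code-block:: python

    from pathlib import Path

    from openeo import BatchJob
    from openeo.extra.job_management import MultiBackendJobManager

    class LoggingJobManager(MultiBackendJobManager):
        def on_job_done(self, job: BatchJob, row):
            # Default behaviour: download the results and write the metadata JSON.
            super().on_job_done(job, row)
            with Path("finished_jobs.txt").open("a") as f:
                f.write(f"{job.job_id}\n")

        def on_job_error(self, job: BatchJob, row):
            # Default behaviour: dump the error logs to a JSON file.
            super().on_job_error(job, row)
            print(f"Job {job.job_id} failed, see {self.get_error_log_path(job.job_id)}")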
- """ - pass - - def _cancel_prolonged_job(self, job: BatchJob, row): - """Cancel the job if it has been running for too long.""" - try: - # Ensure running start time is valid - job_running_start_time = rfc3339.parse_datetime(row.get("running_start_time"), with_timezone=True) - - # Parse the current time into a datetime object with timezone info - current_time = rfc3339.parse_datetime(rfc3339.now_utc(), with_timezone=True) - - # Calculate the elapsed time between job start and now - elapsed = current_time - job_running_start_time - - if elapsed > self._cancel_running_job_after: - _log.info( - f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})" - ) - job.stop() - - except Exception as e: - _log.error(f"Unexpected error while handling job {job.job_id}: {e}") - - def get_job_dir(self, job_id: str) -> Path: - """Path to directory where job metadata, results and error logs are be saved.""" - return self._root_dir / f"job_{job_id}" - - def get_error_log_path(self, job_id: str) -> Path: - """Path where error log file for the job is saved.""" - return self.get_job_dir(job_id) / f"job_{job_id}_errors.json" - - def get_job_metadata_path(self, job_id: str) -> Path: - """Path where job metadata file is saved.""" - return self.get_job_dir(job_id) / f"job_{job_id}.json" - - def ensure_job_dir_exists(self, job_id: str) -> Path: - """Create the job folder if it does not exist yet.""" - job_dir = self.get_job_dir(job_id) - if not job_dir.exists(): - job_dir.mkdir(parents=True) - - def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = None) -> Tuple[List, List, List]: - """ - Tracks status (and stats) of running jobs (in place). - Optionally cancels jobs when running too long. - """ - stats = stats if stats is not None else collections.defaultdict(int) - - active = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]).copy() - - jobs_done = [] - jobs_error = [] - jobs_cancel = [] - - for i in active.index: - job_id = active.loc[i, "id"] - backend_name = active.loc[i, "backend_name"] - previous_status = active.loc[i, "status"] - - try: - con = self._get_connection(backend_name) - the_job = con.job(job_id) - job_metadata = the_job.describe() - stats["job describe"] += 1 - new_status = job_metadata["status"] - - _log.info( - f"Status of job {job_id!r} (on backend {backend_name}) is {new_status!r} (previously {previous_status!r})" - ) - - if previous_status != "finished" and new_status == "finished": - stats["job finished"] += 1 - jobs_done.append((the_job, active.loc[i])) - - if previous_status != "error" and new_status == "error": - stats["job failed"] += 1 - jobs_error.append((the_job, active.loc[i])) - - if new_status == "canceled": - stats["job canceled"] += 1 - jobs_cancel.append((the_job, active.loc[i])) - - if previous_status in {"created", "queued", "queued_for_start"} and new_status == "running": - stats["job started running"] += 1 - active.loc[i, "running_start_time"] = rfc3339.now_utc() - - if self._cancel_running_job_after and new_status == "running": - if not active.loc[i, "running_start_time"] or pd.isna(active.loc[i, "running_start_time"]): - _log.warning( - f"Unknown 'running_start_time' for running job {job_id}. Using current time as an approximation." 
- ) - stats["job started running"] += 1 - active.loc[i, "running_start_time"] = rfc3339.now_utc() - - self._cancel_prolonged_job(the_job, active.loc[i]) - - active.loc[i, "status"] = new_status - - # TODO: there is well hidden coupling here with "cpu", "memory" and "duration" from `_normalize_df` - for key in job_metadata.get("usage", {}).keys(): - if key in active.columns: - active.loc[i, key] = _format_usage_stat(job_metadata, key) - if "costs" in job_metadata.keys(): - active.loc[i, "costs"] = job_metadata.get("costs") - - except OpenEoApiError as e: - # TODO: inspect status code and e.g. differentiate between 4xx/5xx - stats["job tracking error"] += 1 - _log.warning(f"Error while tracking status of job {job_id!r} on backend {backend_name}: {e!r}") - - stats["job_db persist"] += 1 - job_db.persist(active) - - return jobs_done, jobs_error, jobs_cancel - - -def _format_usage_stat(job_metadata: dict, field: str) -> str: - value = deep_get(job_metadata, "usage", field, "value", default=0) - unit = deep_get(job_metadata, "usage", field, "unit", default="") - return f"{value} {unit}".strip() - - -@contextlib.contextmanager -def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): - """Context manager to ignore connection errors.""" - # TODO: move this out of this module and make it a more public utility? - try: - yield - except requests.exceptions.ConnectionError as e: - _log.warning(f"Ignoring connection error (context {context or 'n/a'}): {e}") - # Back off a bit - time.sleep(sleep) - - - -def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: - """ - Factory to get a job database at a given path, - guessing the database type from filename extension. - - :param path: path to job database file. - - .. versionadded:: 0.33.0 - """ - path = Path(path) - if path.suffix.lower() in {".csv"}: - job_db = CsvJobDatabase(path=path) - elif path.suffix.lower() in {".parquet", ".geoparquet"}: - job_db = ParquetJobDatabase(path=path) - else: - raise ValueError(f"Could not guess job database type from {path!r}") - return job_db - - -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): - """ - Factory to create a job database at given path, - initialized from a given dataframe, - and its database type guessed from filename extension. - - :param path: Path to the job database file. - :param df: DataFrame to store in the job database. - :param on_exists: What to do when the job database already exists: - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - .. versionadded:: 0.33.0 - """ - job_db = get_job_db(path) - if isinstance(job_db, FullDataFrameJobDatabase): - job_db.initialize_from_df(df=df, on_exists=on_exists) - else: - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db diff --git a/openeo/extra/job_management/_interface.py b/openeo/extra/job_management/_interface.py new file mode 100644 index 000000000..32f6c9d0b --- /dev/null +++ b/openeo/extra/job_management/_interface.py @@ -0,0 +1,75 @@ +import abc +from typing import ( + Callable, + Iterable, + List, + NamedTuple, + Union, +) + +import pandas as pd + +class JobDatabaseInterface(metaclass=abc.ABCMeta): + """ + Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, + allowing to regularly persist the job metadata while polling the job statuses + and resume/restart the job tracking after it was interrupted. + + .. 
versionadded:: 0.31.0 + """ + + @abc.abstractmethod + def exists(self) -> bool: + """Does the job database already exist, to read job data from?""" + ... + + @abc.abstractmethod + def persist(self, df: pd.DataFrame): + """ + Store (now or updated) job data to the database. + + The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, + so it should be merged with the existing data (if any) instead of overwriting it completely. + + :param df: job data to store. + """ + ... + + @abc.abstractmethod + def count_by_status(self, statuses: Iterable[str] = ()) -> dict: + """ + Retrieve the number of jobs per status. + + :param statuses: List/set of statuses to include. If empty, all statuses are included. + + :return: dictionary with status as key and the count as value. + """ + ... + + @abc.abstractmethod + def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: + """ + Returns a dataframe with jobs, filtered by status. + + :param statuses: List of statuses to include. + :param max: Maximum number of jobs to return. + + :return: DataFrame with jobs filtered by status. + """ + ... + + @abc.abstractmethod + def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: + """ + Returns a dataframe with jobs based on their (dataframe) index + + :param indices: List of indices to include. + + :return: DataFrame with jobs filtered by indices. + """ + ... + + + + + diff --git a/openeo/extra/job_management/_job_database.py b/openeo/extra/job_management/_job_db.py similarity index 65% rename from openeo/extra/job_management/_job_database.py rename to openeo/extra/job_management/_job_db.py index 4be3fc1eb..6248074c9 100644 --- a/openeo/extra/job_management/_job_database.py +++ b/openeo/extra/job_management/_job_db.py @@ -1,104 +1,15 @@ import abc import logging -from pathlib import Path -from typing import ( - Iterable, - Union, - List, -) - - -import pandas as pd import shapely.errors import shapely.wkt - - - -_log = logging.getLogger(__name__) - +from pathlib import Path +from typing import Iterable, Union import pandas as pd -class _ColumnProperties: - def __init__(self, dtype: str, default=None): - self.dtype = dtype - self.default = default - +from openeo.extra.job_management._interface import JobDatabaseInterface +from openeo.extra.job_management._manager import MultiBackendJobManager - -class JobDatabaseInterface(metaclass=abc.ABCMeta): - """ - Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, - allowing to regularly persist the job metadata while polling the job statuses - and resume/restart the job tracking after it was interrupted. - - .. versionadded:: 0.31.0 - """ - - @abc.abstractmethod - def exists(self) -> bool: - """Does the job database already exist, to read job data from?""" - ... - - @abc.abstractmethod - def persist(self, df: pd.DataFrame): - """ - Store (now or updated) job data to the database. - - The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database, - so it should be merged with the existing data (if any) instead of overwriting it completely. - - :param df: job data to store. - """ - ... - - @abc.abstractmethod - def count_by_status(self, statuses: Iterable[str] = ()) -> dict: - """ - Retrieve the number of jobs per status. - - :param statuses: List/set of statuses to include. If empty, all statuses are included. - - :return: dictionary with status as key and the count as value. - """ - ... 
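Besides the file-based implementations further down, this interface can also be implemented directly, e.g. for tests or a non-file store; a minimal in-memory sketch (illustrative only, not part of the library):

.. code-block:: python

    from typing import Iterable, List, Union

    import pandas as pd

    from openeo.extra.job_management import JobDatabaseInterface

    class InMemoryJobDatabase(JobDatabaseInterface):
        # Toy implementation: keeps all job metadata in a single in-memory DataFrame.

        def __init__(self):
            self._df: Union[pd.DataFrame, None] = None

        def exists(self) -> bool:
            return self._df is not None

        def persist(self, df: pd.DataFrame):
            if self._df is None:
                self._df = df.copy()
            else:
                # Upsert: update known rows, append unknown ones.
                self._df.update(df, overwrite=True)
                new_rows = df.loc[df.index.difference(self._df.index)]
                self._df = pd.concat([self._df, new_rows])

        def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
            counts = self._df.groupby("status").size().to_dict()
            statuses = set(statuses)
            return {k: v for k, v in counts.items() if not statuses or k in statuses}

        def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
            selection = self._df[self._df["status"].isin(statuses)]
            return selection.head(max) if max is not None else selection

        def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
            known = set(indices).intersection(self._df.index)
            return self._df.loc[list(known)]

Note that ``run_jobs`` only calls ``initialize_from_df`` on databases that do not ``exists()`` yet, and that method lives on ``FullDataFrameJobDatabase``, so a bare interface implementation like this one would typically be pre-populated via ``persist`` before being handed to the manager.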
- - @abc.abstractmethod - def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame: - """ - Returns a dataframe with jobs, filtered by status. - - :param statuses: List of statuses to include. - :param max: Maximum number of jobs to return. - - :return: DataFrame with jobs filtered by status. - """ - ... - - @abc.abstractmethod - def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: - """ - Returns a dataframe with jobs based on their (dataframe) index - - :param indices: List of indices to include. - - :return: DataFrame with jobs filtered by indices. - """ - ... - -# Expected columns in the job DB dataframes. -# TODO: make this part of public API when settled? -# TODO: move non official statuses to seperate column (not_started, queued_for_start) -COLUMN_REQUIREMENTS = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), -} +_log = logging.getLogger(__name__) class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): @@ -129,7 +40,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): else: # TODO handle other on_exists modes: e.g. overwrite, merge, ... raise ValueError(f"Invalid on_exists={on_exists!r}") - df = normalize_dataframe(df) + df = MultiBackendJobManager._normalize_df(df) self.persist(df) # Return self to allow chaining with constructor. return self @@ -187,7 +98,6 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: return self._df.loc[list(known)] - class CsvJobDatabase(FullDataFrameJobDatabase): """ Persist/load job metadata with a CSV file. @@ -223,7 +133,7 @@ def read(self) -> pd.DataFrame: df = pd.read_csv( self.path, # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in COLUMN_REQUIREMENTS.items()}, + dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, ) if ( "geometry" in df.columns @@ -242,6 +152,7 @@ def persist(self, df: pd.DataFrame): self.path.parent.mkdir(parents=True, exist_ok=True) self.df.to_csv(self.path, index=False) + class ParquetJobDatabase(FullDataFrameJobDatabase): """ Persist/load job metadata with a Parquet file. @@ -289,40 +200,8 @@ def read(self) -> pd.DataFrame: def persist(self, df: pd.DataFrame): self._merge_into_df(df) self.path.parent.mkdir(parents=True, exist_ok=True) - self.df.to_parquet(self.path, index=False) - -def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. - - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. - """ - new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - return df - -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): - """ - Factory to create a job database at given path, - initialized from a given dataframe, - and its database type guessed from filename extension. + self.df.to_parquet(self.path, index=False) - :param path: Path to the job database file. 
- :param df: DataFrame to store in the job database. - :param on_exists: What to do when the job database already exists: - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - - .. versionadded:: 0.33.0 - """ - job_db = get_job_db(path) - if isinstance(job_db, FullDataFrameJobDatabase): - job_db.initialize_from_df(df=df, on_exists=on_exists) - else: - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: """ @@ -362,5 +241,4 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = job_db.initialize_from_df(df=df, on_exists=on_exists) else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db - + return job_db \ No newline at end of file diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py new file mode 100644 index 000000000..1c9dae4a8 --- /dev/null +++ b/openeo/extra/job_management/_manager.py @@ -0,0 +1,853 @@ +import collections +import contextlib +import dataclasses +import datetime +import json +import logging +import time +import warnings +from pathlib import Path +from threading import Thread +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + NamedTuple, + Optional, + Tuple, + Union, +) + +import pandas as pd +import requests +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +from openeo import BatchJob, Connection +from openeo.extra.job_management._thread_worker import ( + _JobManagerWorkerThreadPool, + _JobStartTask, +) +from openeo.extra.job_management._interface import JobDatabaseInterface + +from openeo.rest import OpenEoApiError +from openeo.rest.auth.auth import BearerAuth +from openeo.util import deep_get, rfc3339 + + +_log = logging.getLogger(__name__) + +MAX_RETRIES = 50 +# Sentinel value to indicate that a parameter was not set +_UNSET = object() + + +class _Backend(NamedTuple): + """Container for backend info/settings""" + + # callable to create a backend connection + get_connection: Callable[[], Connection] + # Maximum number of jobs to allow in parallel on a backend + parallel_jobs: int + +@dataclasses.dataclass(frozen=True) +class _ColumnProperties: + """Expected/required properties of a column in the job manager related dataframes""" + + dtype: str = "object" + default: Any = None + + + + +class MultiBackendJobManager: + """ + Tracker for multiple jobs on multiple backends. + + Usage example: + + .. code-block:: python + + import logging + import pandas as pd + import openeo + from openeo.extra.job_management import MultiBackendJobManager + + logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + level=logging.INFO + ) + + manager = MultiBackendJobManager() + manager.add_backend("foo", connection=openeo.connect("http://foo.test")) + manager.add_backend("bar", connection=openeo.connect("http://bar.test")) + + jobs_df = pd.DataFrame(...) + output_file = "jobs.csv" + + def start_job( + row: pd.Series, + connection: openeo.Connection, + **kwargs + ) -> openeo.BatchJob: + year = row["year"] + cube = connection.load_collection( + ..., + temporal_extent=[f"{year}-01-01", f"{year+1}-01-01"], + ) + ... + return cube.create_job(...) + + manager.run_jobs(df=jobs_df, start_job=start_job, output_file=output_file) + + See :py:meth:`.run_jobs` for more information on the ``start_job`` callable. 
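The ``connection`` argument of ``add_backend`` (documented below) also accepts a zero-argument callable, which is convenient when the connection needs authentication; a sketch with a placeholder backend URL (the authentication flow depends on the backend):

.. code-block:: python

    import openeo
    from openeo.extra.job_management import MultiBackendJobManager

    def connect_foo() -> openeo.Connection:
        # Invoked lazily by the manager (and the result is cached), so authentication
        # only happens when the backend is actually used.
        return openeo.connect("https://openeo.foo.test").authenticate_oidc()

    manager = MultiBackendJobManager(cancel_running_job_after=6 * 60 * 60)  # cancel jobs running > ~6 hours
    manager.add_backend("foo", connection=connect_foo, parallel_jobs=4)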
+ + :param poll_sleep: + How many seconds to sleep between polls. + + :param root_dir: + Root directory to save files for the jobs, e.g. metadata and error logs. + This defaults to "." the current directory. + + Each job gets its own subfolder in this root directory. + You can use the following methods to find the relevant paths, + based on the job ID: + + - get_job_dir + - get_error_log_path + - get_job_metadata_path + + :param cancel_running_job_after: + Optional temporal limit (in seconds) after which running jobs should be canceled + by the job manager. + + .. versionadded:: 0.14.0 + + .. versionchanged:: 0.32.0 + Added ``cancel_running_job_after`` parameter. + """ + + # Expected columns in the job DB dataframes. + # TODO: make this part of public API when settled? + # TODO: move non official statuses to seperate column (not_started, queued_for_start) + _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + # TODO: use proper date/time dtype instead of legacy str for start times? + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, + # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. + # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), + } + + def __init__( + self, + poll_sleep: int = 60, + root_dir: Optional[Union[str, Path]] = ".", + *, + cancel_running_job_after: Optional[int] = None, + ): + """Create a MultiBackendJobManager.""" + self._stop_thread = None + self.backends: Dict[str, _Backend] = {} + self.poll_sleep = poll_sleep + self._connections: Dict[str, _Backend] = {} + + # An explicit None or "" should also default to "." + self._root_dir = Path(root_dir or ".") + + self._cancel_running_job_after = ( + datetime.timedelta(seconds=cancel_running_job_after) if cancel_running_job_after is not None else None + ) + self._thread = None + self._worker_pool = None + # Generic cache + self._cache = {} + + def add_backend( + self, + name: str, + connection: Union[Connection, Callable[[], Connection]], + parallel_jobs: int = 2, + ): + """ + Register a backend with a name and a Connection getter. + + :param name: + Name of the backend. + :param connection: + Either a Connection to the backend, or a callable to create a backend connection. + :param parallel_jobs: + Maximum number of jobs to allow in parallel on a backend. + """ + + # TODO: Code might become simpler if we turn _Backend into class move this logic there. + # We would need to keep add_backend here as part of the public API though. 
+ # But the amount of unrelated "stuff to manage" would be less (better cohesion) + if isinstance(connection, Connection): + c = connection + connection = lambda: c + assert callable(connection) + self.backends[name] = _Backend(get_connection=connection, parallel_jobs=parallel_jobs) + + def _get_connection(self, backend_name: str, resilient: bool = True) -> Connection: + """Get a connection for the backend and optionally make it resilient (adds retry behavior) + + The default is to get a resilient connection, but if necessary you can turn it off with + resilient=False + """ + + # TODO: Code could be simplified if _Backend is a class and this method is moved there. + # TODO: Is it better to make this a public method? + + # Reuse the connection if we can, in order to avoid modifying the same connection several times. + # This is to avoid adding the retry HTTPAdapter multiple times. + # Remember that the get_connection attribute on _Backend can be a Connection object instead + # of a callable, so we don't want to assume it is a fresh connection that doesn't have the + # retry adapter yet. + if backend_name in self._connections: + return self._connections[backend_name] + + connection = self.backends[backend_name].get_connection() + # If we really need it we can skip making it resilient, but by default it should be resilient. + if resilient: + self._make_resilient(connection) + + self._connections[backend_name] = connection + return connection + + @staticmethod + def _make_resilient(connection): + """Add an HTTPAdapter that retries the request if it fails. + + Retry for the following HTTP 50x statuses: + 502 Bad Gateway + 503 Service Unavailable + 504 Gateway Timeout + """ + # TODO: migrate this to now built-in retry configuration of `Connection` or `openeo.util.http.retry_adapter`? + status_forcelist = [500, 502, 503, 504] + retries = Retry( + total=MAX_RETRIES, + read=MAX_RETRIES, + other=MAX_RETRIES, + status=MAX_RETRIES, + backoff_factor=0.1, + status_forcelist=status_forcelist, + allowed_methods=["HEAD", "GET", "OPTIONS", "POST"], + ) + connection.session.mount("https://", HTTPAdapter(max_retries=retries)) + connection.session.mount("http://", HTTPAdapter(max_retries=retries)) + + @classmethod + def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. + """ + new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + + return df + + def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): + """ + Start running the jobs in a separate thread, returns afterwards. + + :param start_job: + A callback which will be invoked with, amongst others, + the row of the dataframe for which a job should be created and/or started. + This callable should return a :py:class:`openeo.rest.job.BatchJob` object. + + The following parameters will be passed to ``start_job``: + + ``row`` (:py:class:`pandas.Series`): + The row in the pandas dataframe that stores the jobs state and other tracked data. + + ``connection_provider``: + A getter to get a connection by backend name. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. 
+ + ``connection`` (:py:class:`Connection`): + The :py:class:`Connection` itself, that has already been created. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. + + ``provider`` (``str``): + The name of the backend that will run the job. + + You do not have to define all the parameters described below, but if you leave + any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. + Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. + :param job_db: + Job database to load/store existing job status data and other metadata from/to. + Can be specified as a path to CSV or Parquet file, + or as a custom database object following the :py:class:`JobDatabaseInterface` interface. + + .. note:: + Support for Parquet files depends on the ``pyarrow`` package + as :ref:`optional dependency `. + + .. versionadded:: 0.32.0 + """ + + # Resume from existing db + _log.info(f"Resuming `run_jobs` from existing {job_db}") + + self._stop_thread = False + self._worker_pool = _JobManagerWorkerThreadPool() + + def run_loop(): + # TODO: support user-provided `stats` + stats = collections.defaultdict(int) + + while ( + sum( + job_db.count_by_status( + statuses=["not_started", "created", "queued", "queued_for_start", "running"] + ).values() + ) + > 0 + and not self._stop_thread + ): + self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) + stats["run_jobs loop"] += 1 + + # Show current stats and sleep + _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") + for _ in range(int(max(1, self.poll_sleep))): + time.sleep(1) + if self._stop_thread: + break + + self._thread = Thread(target=run_loop) + self._thread.start() + + def stop_job_thread(self, timeout_seconds: Optional[float] = _UNSET): + """ + Stop the job polling thread. + + :param timeout_seconds: The time to wait for the thread to stop. + By default, it will wait for 2 times the poll_sleep time. + Set to None to wait indefinitely. + + .. versionadded:: 0.32.0 + """ + self._worker_pool.shutdown() + + if self._thread is not None: + self._stop_thread = True + if timeout_seconds is _UNSET: + timeout_seconds = 2 * self.poll_sleep + self._thread.join(timeout_seconds) + if self._thread.is_alive(): + _log.warning("Job thread did not stop after timeout") + else: + _log.error("No job thread to stop") + + def run_jobs( + self, + df: Optional[pd.DataFrame] = None, + start_job: Callable[[], BatchJob] = _start_job_default, + job_db: Union[str, Path, JobDatabaseInterface, None] = None, + **kwargs, + ) -> dict: + """Runs jobs, specified in a dataframe, and tracks parameters. + + :param df: + DataFrame that specifies the jobs, and tracks the jobs' statuses. If None, the job_db has to be specified and will be used. + + :param start_job: + A callback which will be invoked with, amongst others, + the row of the dataframe for which a job should be created and/or started. + This callable should return a :py:class:`openeo.rest.job.BatchJob` object. + + The following parameters will be passed to ``start_job``: + + ``row`` (:py:class:`pandas.Series`): + The row in the pandas dataframe that stores the jobs state and other tracked data. + + ``connection_provider``: + A getter to get a connection by backend name. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. 
+ + ``connection`` (:py:class:`Connection`): + The :py:class:`Connection` itself, that has already been created. + Typically, you would need either the parameter ``connection_provider``, + or the parameter ``connection``, but likely you will not need both. + + ``provider`` (``str``): + The name of the backend that will run the job. + + You do not have to define all the parameters described below, but if you leave + any of them out, then remember to include the ``*args`` and ``**kwargs`` parameters. + Otherwise you will have an exception because :py:meth:`run_jobs` passes unknown parameters to ``start_job``. + + :param job_db: + Job database to load/store existing job status data and other metadata from/to. + Can be specified as a path to CSV or Parquet file, + or as a custom database object following the :py:class:`JobDatabaseInterface` interface. + + .. note:: + Support for Parquet files depends on the ``pyarrow`` package + as :ref:`optional dependency `. + + :return: dictionary with stats collected during the job running loop. + Note that the set of fields in this dictionary is experimental + and subject to change + + .. versionchanged:: 0.31.0 + Added support for persisting the job metadata in Parquet format. + + .. versionchanged:: 0.31.0 + Replace ``output_file`` argument with ``job_db`` argument, + which can be a path to a CSV or Parquet file, + or a user-defined :py:class:`JobDatabaseInterface` object. + The deprecated ``output_file`` argument is still supported for now. + + .. versionchanged:: 0.33.0 + return a stats dictionary + """ + # TODO Defining start_jobs as a Protocol might make its usage more clear, and avoid complicated docstrings, + + # Backwards compatibility for deprecated `output_file` argument + if "output_file" in kwargs: + if job_db is not None: + raise ValueError("Only one of `output_file` and `job_db` should be provided") + warnings.warn( + "The `output_file` argument is deprecated. Use `job_db` instead.", DeprecationWarning, stacklevel=2 + ) + job_db = kwargs.pop("output_file") + assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" + + if isinstance(job_db, (str, Path)): + job_db = get_job_db(path=job_db) + + if not isinstance(job_db, JobDatabaseInterface): + raise ValueError(f"Unsupported job_db {job_db!r}") + + if job_db.exists(): + # Resume from existing db + _log.info(f"Resuming `run_jobs` from existing {job_db}") + elif df is not None: + # TODO: start showing deprecation warnings for this usage pattern? + job_db.initialize_from_df(df) + + # TODO: support user-provided `stats` + stats = collections.defaultdict(int) + + self._worker_pool = _JobManagerWorkerThreadPool() + + while ( + sum( + job_db.count_by_status( + statuses=["not_started", "created", "queued_for_start", "queued", "running"] + ).values() + ) + > 0 + ): + self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats) + stats["run_jobs loop"] += 1 + + # Show current stats and sleep + _log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}") + time.sleep(self.poll_sleep) + stats["sleep"] += 1 + + # TODO; run post process after shutdown once more to ensure completion? + self._worker_pool.shutdown() + + return stats + + def _job_update_loop( + self, job_db: JobDatabaseInterface, start_job: Callable[[], BatchJob], stats: Optional[dict] = None + ): + """ + Inner loop logic of job management: + go through the necessary jobs to check for status updates, + trigger status events, start new jobs when there is room for them, etc. 
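The stop condition above is built on ``count_by_status``, which is also handy for ad-hoc monitoring of a job database from a separate script; a small sketch reusing the ``"jobs.csv"`` file from the usage example:

.. code-block:: python

    from openeo.extra.job_management import CsvJobDatabase

    job_db = CsvJobDatabase(path="jobs.csv")
    if job_db.exists():
        print("Overall:", job_db.count_by_status())
        print("Still active:", job_db.count_by_status(statuses=["queued", "running"]))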
+ """ + if not self.backends: + raise RuntimeError("No backends registered") + + stats = stats if stats is not None else collections.defaultdict(int) + + with ignore_connection_errors(context="get statuses"): + jobs_done, jobs_error, jobs_cancel = self._track_statuses(job_db, stats=stats) + stats["track_statuses"] += 1 + + not_started = job_db.get_by_status(statuses=["not_started"], max=200).copy() + if len(not_started) > 0: + # Check number of jobs running at each backend + # TODO: should "created" be included in here? Calling this "running" is quite misleading then. + running = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]) + stats["job_db get_by_status"] += 1 + per_backend = running.groupby("backend_name").size().to_dict() + _log.info(f"Running per backend: {per_backend}") + total_added = 0 + for backend_name in self.backends: + backend_load = per_backend.get(backend_name, 0) + if backend_load < self.backends[backend_name].parallel_jobs: + to_add = self.backends[backend_name].parallel_jobs - backend_load + for i in not_started.index[total_added : total_added + to_add]: + self._launch_job(start_job, df=not_started, i=i, backend_name=backend_name, stats=stats) + stats["job launch"] += 1 + + job_db.persist(not_started.loc[i : i + 1]) + stats["job_db persist"] += 1 + total_added += 1 + + self._process_threadworker_updates(self._worker_pool, job_db=job_db, stats=stats) + + # TODO: move this back closer to the `_track_statuses` call above, once job done/error handling is also handled in threads? + for job, row in jobs_done: + self.on_job_done(job, row) + + for job, row in jobs_error: + self.on_job_error(job, row) + + for job, row in jobs_cancel: + self.on_job_cancel(job, row) + + def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = None): + """Helper method for launching jobs + + :param start_job: + A callback which will be invoked with the row of the dataframe for which a job should be started. + This callable should return a :py:class:`openeo.rest.job.BatchJob` object. + + See also: + `MultiBackendJobManager.run_jobs` for the parameters and return type of this callable + + Even though it is called here in `_launch_job` and that is where the constraints + really come from, the public method `run_jobs` needs to document `start_job` anyway, + so let's avoid duplication in the docstrings. + + :param df: + DataFrame that specifies the jobs, and tracks the jobs' statuses. + + :param i: + index of the job's row in dataframe df + + :param backend_name: + name of the backend that will execute the job. 
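A condensed, standalone sketch of the slot-filling arithmetic in ``_job_update_loop`` above, with toy dataframes (column names as in the job database):

.. code-block:: python

    import pandas as pd

    parallel_jobs = {"foo": 2, "bar": 3}  # per-backend limits, as registered via add_backend()

    running = pd.DataFrame({"backend_name": ["foo", "foo", "bar"], "status": ["running"] * 3})
    not_started = pd.DataFrame({"status": ["not_started"] * 5})

    per_backend = running.groupby("backend_name").size().to_dict()
    total_added = 0
    for backend_name, limit in parallel_jobs.items():
        free_slots = max(limit - per_backend.get(backend_name, 0), 0)
        for i in not_started.index[total_added : total_added + free_slots]:
            print(f"would launch row {i} on backend {backend_name!r}")
            total_added += 1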
+ """ + stats = stats if stats is not None else collections.defaultdict(int) + + df.loc[i, "backend_name"] = backend_name + row = df.loc[i] + try: + _log.info(f"Starting job on backend {backend_name} for {row.to_dict()}") + connection = self._get_connection(backend_name, resilient=True) + + stats["start_job call"] += 1 + job = start_job( + row=row, + connection_provider=self._get_connection, + connection=connection, + provider=backend_name, + ) + except requests.exceptions.ConnectionError as e: + _log.warning(f"Failed to start job for {row.to_dict()}", exc_info=True) + df.loc[i, "status"] = "start_failed" + stats["start_job error"] += 1 + else: + df.loc[i, "start_time"] = rfc3339.now_utc() + if job: + df.loc[i, "id"] = job.job_id + _log.info(f"Job created: {job.job_id}") + with ignore_connection_errors(context="get status"): + status = job.status() + stats["job get status"] += 1 + df.loc[i, "status"] = status + if status == "created": + # start job if not yet done by callback + try: + job_con = job.connection + # Proactively refresh bearer token (because task in thread will not be able to do that) + self._refresh_bearer_token(connection=job_con) + task = _JobStartTask( + root_url=job_con.root_url, + bearer_token=job_con.auth.bearer if isinstance(job_con.auth, BearerAuth) else None, + job_id=job.job_id, + df_idx=i, + ) + _log.info(f"Submitting task {task} to thread pool") + self._worker_pool.submit_task(task) + + stats["job_queued_for_start"] += 1 + df.loc[i, "status"] = "queued_for_start" + except OpenEoApiError as e: + _log.info(f"Failed submitting task {task} to thread pool with error: {e}") + df.loc[i, "status"] = "queued_for_start_failed" + stats["job queued for start failed"] += 1 + else: + # TODO: what is this "skipping" about actually? + df.loc[i, "status"] = "skipped" + stats["start_job skipped"] += 1 + + def _refresh_bearer_token(self, connection: Connection, *, max_age: float = 60) -> None: + """ + Helper to proactively refresh the bearer (access) token of the connection + (but not too often, based on `max_age`). + """ + # TODO: be smarter about timing, e.g. by inspecting expiry of current token? + now = time.time() + key = f"connection:{id(connection)}:refresh-time" + if self._cache.get(key, 0) + max_age < now: + refreshed = connection.try_access_token_refresh() + if refreshed: + self._cache[key] = now + else: + _log.warning("Failed to proactively refresh bearer token") + + def _process_threadworker_updates( + self, + worker_pool: _JobManagerWorkerThreadPool, + *, + job_db: JobDatabaseInterface, + stats: Dict[str, int], + ) -> None: + """ + Fetches completed TaskResult objects from the worker pool and applies + their db_update and stats_updates. Only existing DataFrame rows + (matched by df_idx) are upserted via job_db.persist(). Any results + targeting unknown df_idx indices are logged as errors but not persisted. 
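The upsert described here comes down to a keyed ``pandas.DataFrame.update``; a tiny illustration with made-up update payloads:

.. code-block:: python

    import pandas as pd

    # Slice of the job database as returned by get_by_indices (the index plays the role of df_idx).
    df = pd.DataFrame({"id": ["j-1", "j-2"], "status": ["queued_for_start"] * 2}, index=[0, 1])

    # Updates reported by completed worker tasks.
    updates = [{"df_idx": 0, "status": "queued"}, {"df_idx": 1, "status": "error"}]

    df.update(pd.DataFrame(updates).set_index("df_idx"), overwrite=True)
    print(df)  # statuses are now "queued" and "error"; this frame would then be persisted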
+ + :param worker_pool: Thread-pool managing asynchronous Task executes + :param job_db: Interface to append/upsert to the job database + :param stats: Dictionary accumulating statistic counters + """ + # Retrieve completed task results immediately + results, _ = worker_pool.process_futures(timeout=0) + + # Collect update dicts + updates: List[Dict[str, Any]] = [] + for res in results: + # Process database updates + if res.db_update: + try: + updates.append( + { + "id": res.job_id, + "df_idx": res.df_idx, + **res.db_update, + } + ) + except Exception as e: + _log.error(f"Skipping invalid db_update {res.db_update!r} for job {res.job_id!r}: {e}") + + # Process stats updates + if res.stats_update: + try: + for key, val in res.stats_update.items(): + count = int(val) + stats[key] = stats.get(key, 0) + count + except Exception as e: + _log.error(f"Skipping invalid stats_update {res.stats_update!r} for job {res.job_id!r}: {e}") + + # No valid updates: nothing to persist + if not updates: + return + + # Build update DataFrame and persist + df_updates = job_db.get_by_indices(indices=set(u["df_idx"] for u in updates)) + df_updates.update(pd.DataFrame(updates).set_index("df_idx", drop=True), overwrite=True) + job_db.persist(df_updates) + stats["job_db persist"] = stats.get("job_db persist", 0) + 1 + + def on_job_done(self, job: BatchJob, row): + """ + Handles jobs that have finished. Can be overridden to provide custom behaviour. + + Default implementation downloads the results into a folder containing the title. + + :param job: The job that has finished. + :param row: DataFrame row containing the job's metadata. + """ + # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? + + job_metadata = job.describe() + job_dir = self.get_job_dir(job.job_id) + metadata_path = self.get_job_metadata_path(job.job_id) + + self.ensure_job_dir_exists(job.job_id) + job.get_results().download_files(target=job_dir) + + with metadata_path.open("w", encoding="utf-8") as f: + json.dump(job_metadata, f, ensure_ascii=False) + + def on_job_error(self, job: BatchJob, row): + """ + Handles jobs that stopped with errors. Can be overridden to provide custom behaviour. + + Default implementation writes the error logs to a JSON file. + + :param job: The job that has finished. + :param row: DataFrame row containing the job's metadata. + """ + # TODO: param `row` is never accessed in this method. Remove it? Is this intended for future use? + + error_logs = job.logs(level="error") + error_log_path = self.get_error_log_path(job.job_id) + + if len(error_logs) > 0: + self.ensure_job_dir_exists(job.job_id) + error_log_path.write_text(json.dumps(error_logs, indent=2)) + + def on_job_cancel(self, job: BatchJob, row): + """ + Handle a job that was cancelled. Can be overridden to provide custom behaviour. + + Default implementation does not do anything. + + :param job: The job that was canceled. + :param row: DataFrame row containing the job's metadata. 
+ """ + pass + + def _cancel_prolonged_job(self, job: BatchJob, row): + """Cancel the job if it has been running for too long.""" + try: + # Ensure running start time is valid + job_running_start_time = rfc3339.parse_datetime(row.get("running_start_time"), with_timezone=True) + + # Parse the current time into a datetime object with timezone info + current_time = rfc3339.parse_datetime(rfc3339.now_utc(), with_timezone=True) + + # Calculate the elapsed time between job start and now + elapsed = current_time - job_running_start_time + + if elapsed > self._cancel_running_job_after: + _log.info( + f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})" + ) + job.stop() + + except Exception as e: + _log.error(f"Unexpected error while handling job {job.job_id}: {e}") + + def get_job_dir(self, job_id: str) -> Path: + """Path to directory where job metadata, results and error logs are be saved.""" + return self._root_dir / f"job_{job_id}" + + def get_error_log_path(self, job_id: str) -> Path: + """Path where error log file for the job is saved.""" + return self.get_job_dir(job_id) / f"job_{job_id}_errors.json" + + def get_job_metadata_path(self, job_id: str) -> Path: + """Path where job metadata file is saved.""" + return self.get_job_dir(job_id) / f"job_{job_id}.json" + + def ensure_job_dir_exists(self, job_id: str) -> Path: + """Create the job folder if it does not exist yet.""" + job_dir = self.get_job_dir(job_id) + if not job_dir.exists(): + job_dir.mkdir(parents=True) + + def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = None) -> Tuple[List, List, List]: + """ + Tracks status (and stats) of running jobs (in place). + Optionally cancels jobs when running too long. + """ + stats = stats if stats is not None else collections.defaultdict(int) + + active = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]).copy() + + jobs_done = [] + jobs_error = [] + jobs_cancel = [] + + for i in active.index: + job_id = active.loc[i, "id"] + backend_name = active.loc[i, "backend_name"] + previous_status = active.loc[i, "status"] + + try: + con = self._get_connection(backend_name) + the_job = con.job(job_id) + job_metadata = the_job.describe() + stats["job describe"] += 1 + new_status = job_metadata["status"] + + _log.info( + f"Status of job {job_id!r} (on backend {backend_name}) is {new_status!r} (previously {previous_status!r})" + ) + + if previous_status != "finished" and new_status == "finished": + stats["job finished"] += 1 + jobs_done.append((the_job, active.loc[i])) + + if previous_status != "error" and new_status == "error": + stats["job failed"] += 1 + jobs_error.append((the_job, active.loc[i])) + + if new_status == "canceled": + stats["job canceled"] += 1 + jobs_cancel.append((the_job, active.loc[i])) + + if previous_status in {"created", "queued", "queued_for_start"} and new_status == "running": + stats["job started running"] += 1 + active.loc[i, "running_start_time"] = rfc3339.now_utc() + + if self._cancel_running_job_after and new_status == "running": + if not active.loc[i, "running_start_time"] or pd.isna(active.loc[i, "running_start_time"]): + _log.warning( + f"Unknown 'running_start_time' for running job {job_id}. Using current time as an approximation." 
+ ) + stats["job started running"] += 1 + active.loc[i, "running_start_time"] = rfc3339.now_utc() + + self._cancel_prolonged_job(the_job, active.loc[i]) + + active.loc[i, "status"] = new_status + + # TODO: there is well hidden coupling here with "cpu", "memory" and "duration" from `_normalize_df` + for key in job_metadata.get("usage", {}).keys(): + if key in active.columns: + active.loc[i, key] = _format_usage_stat(job_metadata, key) + if "costs" in job_metadata.keys(): + active.loc[i, "costs"] = job_metadata.get("costs") + + except OpenEoApiError as e: + # TODO: inspect status code and e.g. differentiate between 4xx/5xx + stats["job tracking error"] += 1 + _log.warning(f"Error while tracking status of job {job_id!r} on backend {backend_name}: {e!r}") + + stats["job_db persist"] += 1 + job_db.persist(active) + + return jobs_done, jobs_error, jobs_cancel + +def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): + raise NotImplementedError("No 'start_job' callable provided") + +def _format_usage_stat(job_metadata: dict, field: str) -> str: + value = deep_get(job_metadata, "usage", field, "value", default=0) + unit = deep_get(job_metadata, "usage", field, "unit", default="") + return f"{value} {unit}".strip() + + +@contextlib.contextmanager +def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): + """Context manager to ignore connection errors.""" + # TODO: move this out of this module and make it a more public utility? + try: + yield + except requests.exceptions.ConnectionError as e: + _log.warning(f"Ignoring connection error (context {context or 'n/a'}): {e}") + # Back off a bit + time.sleep(sleep) From f635528308233908f0654bf2f6e9eb19f0474e2e Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Wed, 5 Nov 2025 15:14:53 +0100 Subject: [PATCH 07/10] splitting up text and detecting circular import --- openeo/extra/job_management/_job_db.py | 39 -- openeo/extra/job_management/_manager.py | 7 +- tests/extra/job_management/test_job_db.py | 381 +++++++++++++++++ ...test_job_management.py => test_manager.py} | 383 +----------------- 4 files changed, 396 insertions(+), 414 deletions(-) create mode 100644 tests/extra/job_management/test_job_db.py rename tests/extra/job_management/{test_job_management.py => test_manager.py} (73%) diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py index 6248074c9..c6e3521fd 100644 --- a/openeo/extra/job_management/_job_db.py +++ b/openeo/extra/job_management/_job_db.py @@ -203,42 +203,3 @@ def persist(self, df: pd.DataFrame): self.df.to_parquet(self.path, index=False) -def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: - """ - Factory to get a job database at a given path, - guessing the database type from filename extension. - - :param path: path to job database file. - - .. versionadded:: 0.33.0 - """ - path = Path(path) - if path.suffix.lower() in {".csv"}: - job_db = CsvJobDatabase(path=path) - elif path.suffix.lower() in {".parquet", ".geoparquet"}: - job_db = ParquetJobDatabase(path=path) - else: - raise ValueError(f"Could not guess job database type from {path!r}") - return job_db - - -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): - """ - Factory to create a job database at given path, - initialized from a given dataframe, - and its database type guessed from filename extension. - - :param path: Path to the job database file. - :param df: DataFrame to store in the job database. 
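As background for the job-database tests that follow, a typical round trip through the factories under test could look like this (file name and contents are illustrative; the imports assume the public re-exports from openeo.extra.job_management):

import pandas as pd
from openeo.extra.job_management import CsvJobDatabase, create_job_db, get_job_db

df = pd.DataFrame({"year": [2023, 2024]})
db = create_job_db("jobs.csv", df=df)   # database type guessed from the extension -> CsvJobDatabase
assert isinstance(db, CsvJobDatabase)
assert db.count_by_status() == {"not_started": 2}   # default columns added by normalization

db2 = get_job_db("jobs.csv")            # reopen later without re-initializing
assert db2.exists()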
- :param on_exists: What to do when the job database already exists: - - "error": (default) raise an exception - - "skip": work with existing database, ignore given dataframe and skip any initialization - - .. versionadded:: 0.33.0 - """ - job_db = get_job_db(path) - if isinstance(job_db, FullDataFrameJobDatabase): - job_db.initialize_from_df(df=df, on_exists=on_exists) - else: - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db \ No newline at end of file diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py index 1c9dae4a8..febb15bae 100644 --- a/openeo/extra/job_management/_manager.py +++ b/openeo/extra/job_management/_manager.py @@ -31,6 +31,7 @@ _JobStartTask, ) from openeo.extra.job_management._interface import JobDatabaseInterface +#from openeo.extra.job_management._job_db import get_job_db #TODO circular import from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth @@ -43,6 +44,8 @@ # Sentinel value to indicate that a parameter was not set _UNSET = object() +def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): + raise NotImplementedError("No 'start_job' callable provided") class _Backend(NamedTuple): """Container for backend info/settings""" @@ -434,7 +437,7 @@ def run_jobs( assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" if isinstance(job_db, (str, Path)): - job_db = get_job_db(path=job_db) + job_db = get_job_db(path=job_db) #TODO circular import if not isinstance(job_db, JobDatabaseInterface): raise ValueError(f"Unsupported job_db {job_db!r}") @@ -832,8 +835,6 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = return jobs_done, jobs_error, jobs_cancel -def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): - raise NotImplementedError("No 'start_job' callable provided") def _format_usage_stat(job_metadata: dict, field: str) -> str: value = deep_get(job_metadata, "usage", field, "value", default=0) diff --git a/tests/extra/job_management/test_job_db.py b/tests/extra/job_management/test_job_db.py new file mode 100644 index 000000000..53ccce8d5 --- /dev/null +++ b/tests/extra/job_management/test_job_db.py @@ -0,0 +1,381 @@ + +import re +import geopandas + +# TODO: can we avoid using httpretty? +# We need it for testing the resilience, which uses an HTTPadapter with Retry +# but requests-mock also uses an HTTPAdapter for the mocking and basically +# erases the HTTPAdapter we have set up. +# httpretty avoids this specific problem because it mocks at the socket level, +# But I would rather not have two dependencies with almost the same goal. 
+import pandas as pd +import pytest +import shapely.geometry + +from openeo.extra.job_management._job_db import ( + CsvJobDatabase, + ParquetJobDatabase, + create_job_db, + get_job_db, +) + +from openeo.utils.version import ComparableVersion + +JOB_DB_DF_BASICS = pd.DataFrame( + { + "numbers": [3, 2, 1], + "names": ["apple", "banana", "coconut"], + } +) +JOB_DB_GDF_WITH_GEOMETRY = geopandas.GeoDataFrame( + { + "numbers": [11, 22], + "geometry": [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 1)], + }, +) +JOB_DB_DF_WITH_GEOJSON_STRING = pd.DataFrame( + { + "numbers": [11, 22], + "geometry": ['{"type":"Point","coordinates":[1,2]}', '{"type":"Point","coordinates":[1,2]}'], + } +) + + +class TestFullDataFrameJobDatabase: + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_initialize_from_df(self, tmp_path, db_class): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.db" + + db = db_class(path) + assert not path.exists() + db.initialize_from_df(orig_df) + assert path.exists() + + # Check persisted CSV + assert path.exists() + expected_columns = { + "some_number", + "status", + "id", + "start_time", + "running_start_time", + "cpu", + "memory", + "duration", + "backend_name", + "costs", + } + + actual_columns = set(db_class(path).read().columns) + assert actual_columns == expected_columns + + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_initialize_from_df_on_exists_error(self, tmp_path, db_class): + df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.csv" + _ = db_class(path).initialize_from_df(df, on_exists="error") + assert path.exists() + + with pytest.raises(FileExistsError, match="Job database.* already exists"): + _ = db_class(path).initialize_from_df(df, on_exists="error") + + assert set(db_class(path).read()["some_number"]) == {1, 2, 3} + + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_initialize_from_df_on_exists_skip(self, tmp_path, db_class): + path = tmp_path / "jobs.db" + + db = db_class(path).initialize_from_df( + pd.DataFrame({"some_number": [3, 2, 1]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + db = db_class(path).initialize_from_df( + pd.DataFrame({"some_number": [444, 555, 666]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) + def test_count_by_status(self, tmp_path, db_class): + path = tmp_path / "jobs.db" + + db = db_class(path).initialize_from_df( + pd.DataFrame( + { + "status": [ + "not_started", + "created", + "queued", + "queued", + "queued", + "running", + "running", + "finished", + "finished", + "error", + ] + } + ) + ) + assert db.count_by_status(statuses=["not_started"]) == {"not_started": 1} + assert db.count_by_status(statuses=("not_started", "running")) == {"not_started": 1, "running": 2} + assert db.count_by_status(statuses={"finished", "error"}) == {"error": 1, "finished": 2} + + # All statuses by default + assert db.count_by_status() == { + "created": 1, + "error": 1, + "finished": 2, + "not_started": 1, + "queued": 3, + "running": 2, + } + + +class TestCsvJobDatabase: + def test_repr(self, tmp_path): + path = tmp_path / "db.csv" + db = CsvJobDatabase(path) + assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", repr(db)) + assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", str(db)) + + def test_read_wkt(self, tmp_path): + wkt_df = pd.DataFrame( + 
{ + "value": ["wkt"], + "geometry": ["POINT (30 10)"], + } + ) + path = tmp_path / "jobs.csv" + wkt_df.to_csv(path, index=False) + df = CsvJobDatabase(path).read() + assert isinstance(df.geometry[0], shapely.geometry.Point) + + def test_read_non_wkt(self, tmp_path): + non_wkt_df = pd.DataFrame( + { + "value": ["non_wkt"], + "geometry": ["this is no WKT"], + } + ) + path = tmp_path / "jobs.csv" + non_wkt_df.to_csv(path, index=False) + df = CsvJobDatabase(path).read() + assert isinstance(df.geometry[0], str) + + @pytest.mark.parametrize( + ["orig"], + [ + pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), + pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), + pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), + ], + ) + def test_persist_and_read(self, tmp_path, orig: pd.DataFrame): + path = tmp_path / "jobs.parquet" + CsvJobDatabase(path).persist(orig) + assert path.exists() + + loaded = CsvJobDatabase(path).read() + assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() + assert loaded.equals(orig) + assert type(orig) is type(loaded) + + @pytest.mark.parametrize( + ["orig"], + [ + pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), + pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), + pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), + ], + ) + def test_partial_read_write(self, tmp_path, orig: pd.DataFrame): + path = tmp_path / "jobs.csv" + + required_with_default = [ + ("status", "not_started"), + ("id", None), + ("start_time", None), + ] + new_columns = {col: val for (col, val) in required_with_default if col not in orig.columns} + orig = orig.assign(**new_columns) + + db = CsvJobDatabase(path) + db.persist(orig) + assert path.exists() + + loaded = db.get_by_status(statuses=["not_started"], max=2) + assert db.count_by_status(statuses=["not_started"])["not_started"] > 1 + + assert len(loaded) == 2 + loaded.loc[0, "status"] = "running" + loaded.loc[1, "status"] = "error" + db.persist(loaded) + assert db.count_by_status(statuses=["error"])["error"] == 1 + + all = db.read() + assert len(all) == len(orig) + assert all.loc[0, "status"] == "running" + assert all.loc[1, "status"] == "error" + if len(all) > 2: + assert all.loc[2, "status"] == "not_started" + print(loaded.index) + + def test_initialize_from_df(self, tmp_path): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.csv" + + # Initialize the CSV from the dataframe + _ = CsvJobDatabase(path).initialize_from_df(orig_df) + + # Check persisted CSV + assert path.exists() + expected_columns = { + "some_number", + "status", + "id", + "start_time", + "running_start_time", + "cpu", + "memory", + "duration", + "backend_name", + "costs", + } + + # Raw file content check + raw_columns = set(path.read_text().split("\n")[0].split(",")) + # Higher level read + read_columns = set(CsvJobDatabase(path).read().columns) + + assert raw_columns == expected_columns + assert read_columns == expected_columns + + def test_initialize_from_df_on_exists_error(self, tmp_path): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.csv" + _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") + with pytest.raises(FileExistsError, match="Job database.* already exists"): + _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") + + def test_initialize_from_df_on_exists_skip(self, tmp_path): + path = tmp_path / "jobs.csv" + + db = 
CsvJobDatabase(path).initialize_from_df( + pd.DataFrame({"some_number": [3, 2, 1]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + db = CsvJobDatabase(path).initialize_from_df( + pd.DataFrame({"some_number": [444, 555, 666]}), + on_exists="skip", + ) + assert set(db.read()["some_number"]) == {1, 2, 3} + + @pytest.mark.skipif( + ComparableVersion(geopandas.__version__) < "0.14", + reason="This issue has no workaround with geopandas < 0.14 (highest available version on Python 3.8 is 0.13.2)", + ) + def test_read_with_crs_column(self, tmp_path): + """ + Having a column named "crs" can cause obscure error messages when creating a GeoPandas dataframe + https://github.com/Open-EO/openeo-python-client/issues/714 + """ + source_df = pd.DataFrame( + { + "crs": [1234], + "geometry": ["Point(2 3)"], + } + ) + path = tmp_path / "jobs.csv" + source_df.to_csv(path, index=False) + result_df = CsvJobDatabase(path).read() + assert isinstance(result_df, geopandas.GeoDataFrame) + assert result_df.to_dict(orient="list") == { + "crs": [1234], + "geometry": [shapely.geometry.Point(2, 3)], + } + + +class TestParquetJobDatabase: + def test_repr(self, tmp_path): + path = tmp_path / "db.pq" + db = ParquetJobDatabase(path) + assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", repr(db)) + assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", str(db)) + + @pytest.mark.parametrize( + ["orig"], + [ + pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), + pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), + pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), + ], + ) + def test_persist_and_read(self, tmp_path, orig: pd.DataFrame): + path = tmp_path / "jobs.parquet" + ParquetJobDatabase(path).persist(orig) + assert path.exists() + + loaded = ParquetJobDatabase(path).read() + assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() + assert loaded.equals(orig) + assert type(orig) is type(loaded) + + def test_initialize_from_df(self, tmp_path): + orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) + path = tmp_path / "jobs.parquet" + + # Initialize the CSV from the dataframe + _ = ParquetJobDatabase(path).initialize_from_df(orig_df) + + # Check persisted CSV + assert path.exists() + expected_columns = { + "some_number", + "status", + "id", + "start_time", + "running_start_time", + "cpu", + "memory", + "duration", + "backend_name", + "costs", + } + + df_from_disk = ParquetJobDatabase(path).read() + assert set(df_from_disk.columns) == expected_columns + + +@pytest.mark.parametrize( + ["filename", "expected"], + [ + ("jobz.csv", CsvJobDatabase), + ("jobz.parquet", ParquetJobDatabase), + ], +) +def test_get_job_db(tmp_path, filename, expected): + path = tmp_path / filename + db = get_job_db(path) + assert isinstance(db, expected) + assert not path.exists() + + +@pytest.mark.parametrize( + ["filename", "expected"], + [ + ("jobz.csv", CsvJobDatabase), + ("jobz.parquet", ParquetJobDatabase), + ], +) +def test_create_job_db(tmp_path, filename, expected): + df = pd.DataFrame({"year": [2023, 2024]}) + path = tmp_path / filename + db = create_job_db(path=path, df=df) + assert isinstance(db, expected) + assert path.exists() \ No newline at end of file diff --git a/tests/extra/job_management/test_job_management.py b/tests/extra/job_management/test_manager.py similarity index 73% rename from tests/extra/job_management/test_job_management.py rename to tests/extra/job_management/test_manager.py index 58901c05a..58fcf2245 100644 --- 
a/tests/extra/job_management/test_job_management.py +++ b/tests/extra/job_management/test_manager.py @@ -11,7 +11,6 @@ from unittest import mock import dirty_equals -import geopandas # TODO: can we avoid using httpretty? # We need it for testing the resilience, which uses an HTTPadapter with Retry @@ -21,23 +20,21 @@ # But I would rather not have two dependencies with almost the same goal. import httpretty import numpy as np -import pandas import pandas as pd -import pandas.testing import pytest import requests -import shapely.geometry import openeo -import openeo.extra.job_management from openeo import BatchJob -from openeo.extra.job_management import ( +from openeo.extra.job_management._manager import ( MAX_RETRIES, - CsvJobDatabase, MultiBackendJobManager, +) + +from openeo.extra.job_management._job_db import ( + CsvJobDatabase, ParquetJobDatabase, create_job_db, - get_job_db, ) from openeo.extra.job_management._thread_worker import ( Task, @@ -47,7 +44,6 @@ from openeo.rest._testing import DummyBackend from openeo.rest.auth.testing import OidcMock from openeo.util import rfc3339 -from openeo.utils.version import ComparableVersion def _job_id_from_year(process_graph) -> Union[str, None]: @@ -608,7 +604,7 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to not actually sleep, but skip one hour at a time - with mock.patch.object(openeo.extra.job_management.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -727,7 +723,7 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to skip one hour at a time instead of actually sleeping - with mock.patch.object(openeo.extra.job_management.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -759,9 +755,9 @@ def test_process_threadworker_updates(self, tmp_path, caplog): mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats) df_final = job_db.read() - pandas.testing.assert_frame_equal( + pd.testing.assert_frame_equal( df_final[["id", "status"]], - pandas.DataFrame( + pd.DataFrame( { "id": ["j-0", "j-1", "j-2", "j-3"], "status": ["queued", "queued", "created", "created"], @@ -796,9 +792,9 @@ def test_process_threadworker_updates_unknown(self, tmp_path, caplog): mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats) df_final = job_db.read() - pandas.testing.assert_frame_equal( + pd.testing.assert_frame_equal( df_final[["id", "status"]], - pandas.DataFrame( + pd.DataFrame( { "id": ["j-123", "j-456"], "status": ["queued", "created"], @@ -908,361 +904,4 @@ def test_refresh_bearer_token_before_start( assert len(oidc_mock.grant_request_history) == 4 -JOB_DB_DF_BASICS = pd.DataFrame( - { - "numbers": [3, 2, 1], - "names": ["apple", "banana", "coconut"], - } -) -JOB_DB_GDF_WITH_GEOMETRY = geopandas.GeoDataFrame( - { - "numbers": [11, 22], - "geometry": [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 1)], - }, -) -JOB_DB_DF_WITH_GEOJSON_STRING = pd.DataFrame( - { - "numbers": [11, 22], - "geometry": 
['{"type":"Point","coordinates":[1,2]}', '{"type":"Point","coordinates":[1,2]}'], - } -) - - -class TestFullDataFrameJobDatabase: - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_initialize_from_df(self, tmp_path, db_class): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.db" - - db = db_class(path) - assert not path.exists() - db.initialize_from_df(orig_df) - assert path.exists() - - # Check persisted CSV - assert path.exists() - expected_columns = { - "some_number", - "status", - "id", - "start_time", - "running_start_time", - "cpu", - "memory", - "duration", - "backend_name", - "costs", - } - - actual_columns = set(db_class(path).read().columns) - assert actual_columns == expected_columns - - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_initialize_from_df_on_exists_error(self, tmp_path, db_class): - df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.csv" - _ = db_class(path).initialize_from_df(df, on_exists="error") - assert path.exists() - - with pytest.raises(FileExistsError, match="Job database.* already exists"): - _ = db_class(path).initialize_from_df(df, on_exists="error") - - assert set(db_class(path).read()["some_number"]) == {1, 2, 3} - - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_initialize_from_df_on_exists_skip(self, tmp_path, db_class): - path = tmp_path / "jobs.db" - - db = db_class(path).initialize_from_df( - pd.DataFrame({"some_number": [3, 2, 1]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - db = db_class(path).initialize_from_df( - pd.DataFrame({"some_number": [444, 555, 666]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - @pytest.mark.parametrize("db_class", [CsvJobDatabase, ParquetJobDatabase]) - def test_count_by_status(self, tmp_path, db_class): - path = tmp_path / "jobs.db" - - db = db_class(path).initialize_from_df( - pd.DataFrame( - { - "status": [ - "not_started", - "created", - "queued", - "queued", - "queued", - "running", - "running", - "finished", - "finished", - "error", - ] - } - ) - ) - assert db.count_by_status(statuses=["not_started"]) == {"not_started": 1} - assert db.count_by_status(statuses=("not_started", "running")) == {"not_started": 1, "running": 2} - assert db.count_by_status(statuses={"finished", "error"}) == {"error": 1, "finished": 2} - - # All statuses by default - assert db.count_by_status() == { - "created": 1, - "error": 1, - "finished": 2, - "not_started": 1, - "queued": 3, - "running": 2, - } - - -class TestCsvJobDatabase: - def test_repr(self, tmp_path): - path = tmp_path / "db.csv" - db = CsvJobDatabase(path) - assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", repr(db)) - assert re.match(r"CsvJobDatabase\('[^']+\.csv'\)", str(db)) - - def test_read_wkt(self, tmp_path): - wkt_df = pd.DataFrame( - { - "value": ["wkt"], - "geometry": ["POINT (30 10)"], - } - ) - path = tmp_path / "jobs.csv" - wkt_df.to_csv(path, index=False) - df = CsvJobDatabase(path).read() - assert isinstance(df.geometry[0], shapely.geometry.Point) - - def test_read_non_wkt(self, tmp_path): - non_wkt_df = pd.DataFrame( - { - "value": ["non_wkt"], - "geometry": ["this is no WKT"], - } - ) - path = tmp_path / "jobs.csv" - non_wkt_df.to_csv(path, index=False) - df = CsvJobDatabase(path).read() - assert isinstance(df.geometry[0], str) - - @pytest.mark.parametrize( - ["orig"], - [ - pytest.param(JOB_DB_DF_BASICS, id="pandas 
basics"), - pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), - pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), - ], - ) - def test_persist_and_read(self, tmp_path, orig: pandas.DataFrame): - path = tmp_path / "jobs.parquet" - CsvJobDatabase(path).persist(orig) - assert path.exists() - - loaded = CsvJobDatabase(path).read() - assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() - assert loaded.equals(orig) - assert type(orig) is type(loaded) - - @pytest.mark.parametrize( - ["orig"], - [ - pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), - pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), - pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), - ], - ) - def test_partial_read_write(self, tmp_path, orig: pandas.DataFrame): - path = tmp_path / "jobs.csv" - - required_with_default = [ - ("status", "not_started"), - ("id", None), - ("start_time", None), - ] - new_columns = {col: val for (col, val) in required_with_default if col not in orig.columns} - orig = orig.assign(**new_columns) - - db = CsvJobDatabase(path) - db.persist(orig) - assert path.exists() - - loaded = db.get_by_status(statuses=["not_started"], max=2) - assert db.count_by_status(statuses=["not_started"])["not_started"] > 1 - - assert len(loaded) == 2 - loaded.loc[0, "status"] = "running" - loaded.loc[1, "status"] = "error" - db.persist(loaded) - assert db.count_by_status(statuses=["error"])["error"] == 1 - - all = db.read() - assert len(all) == len(orig) - assert all.loc[0, "status"] == "running" - assert all.loc[1, "status"] == "error" - if len(all) > 2: - assert all.loc[2, "status"] == "not_started" - print(loaded.index) - - def test_initialize_from_df(self, tmp_path): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.csv" - - # Initialize the CSV from the dataframe - _ = CsvJobDatabase(path).initialize_from_df(orig_df) - - # Check persisted CSV - assert path.exists() - expected_columns = { - "some_number", - "status", - "id", - "start_time", - "running_start_time", - "cpu", - "memory", - "duration", - "backend_name", - "costs", - } - - # Raw file content check - raw_columns = set(path.read_text().split("\n")[0].split(",")) - # Higher level read - read_columns = set(CsvJobDatabase(path).read().columns) - - assert raw_columns == expected_columns - assert read_columns == expected_columns - def test_initialize_from_df_on_exists_error(self, tmp_path): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.csv" - _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") - with pytest.raises(FileExistsError, match="Job database.* already exists"): - _ = CsvJobDatabase(path).initialize_from_df(orig_df, on_exists="error") - - def test_initialize_from_df_on_exists_skip(self, tmp_path): - path = tmp_path / "jobs.csv" - - db = CsvJobDatabase(path).initialize_from_df( - pd.DataFrame({"some_number": [3, 2, 1]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - db = CsvJobDatabase(path).initialize_from_df( - pd.DataFrame({"some_number": [444, 555, 666]}), - on_exists="skip", - ) - assert set(db.read()["some_number"]) == {1, 2, 3} - - @pytest.mark.skipif( - ComparableVersion(geopandas.__version__) < "0.14", - reason="This issue has no workaround with geopandas < 0.14 (highest available version on Python 3.8 is 0.13.2)", - ) - def test_read_with_crs_column(self, tmp_path): - """ - Having a column named "crs" can cause 
obscure error messages when creating a GeoPandas dataframe - https://github.com/Open-EO/openeo-python-client/issues/714 - """ - source_df = pd.DataFrame( - { - "crs": [1234], - "geometry": ["Point(2 3)"], - } - ) - path = tmp_path / "jobs.csv" - source_df.to_csv(path, index=False) - result_df = CsvJobDatabase(path).read() - assert isinstance(result_df, geopandas.GeoDataFrame) - assert result_df.to_dict(orient="list") == { - "crs": [1234], - "geometry": [shapely.geometry.Point(2, 3)], - } - - -class TestParquetJobDatabase: - def test_repr(self, tmp_path): - path = tmp_path / "db.pq" - db = ParquetJobDatabase(path) - assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", repr(db)) - assert re.match(r"ParquetJobDatabase\('[^']+\.pq'\)", str(db)) - - @pytest.mark.parametrize( - ["orig"], - [ - pytest.param(JOB_DB_DF_BASICS, id="pandas basics"), - pytest.param(JOB_DB_GDF_WITH_GEOMETRY, id="geopandas with geometry"), - pytest.param(JOB_DB_DF_WITH_GEOJSON_STRING, id="pandas with geojson string as geometry"), - ], - ) - def test_persist_and_read(self, tmp_path, orig: pandas.DataFrame): - path = tmp_path / "jobs.parquet" - ParquetJobDatabase(path).persist(orig) - assert path.exists() - - loaded = ParquetJobDatabase(path).read() - assert loaded.dtypes.to_dict() == orig.dtypes.to_dict() - assert loaded.equals(orig) - assert type(orig) is type(loaded) - - def test_initialize_from_df(self, tmp_path): - orig_df = pd.DataFrame({"some_number": [3, 2, 1]}) - path = tmp_path / "jobs.parquet" - - # Initialize the CSV from the dataframe - _ = ParquetJobDatabase(path).initialize_from_df(orig_df) - - # Check persisted CSV - assert path.exists() - expected_columns = { - "some_number", - "status", - "id", - "start_time", - "running_start_time", - "cpu", - "memory", - "duration", - "backend_name", - "costs", - } - - df_from_disk = ParquetJobDatabase(path).read() - assert set(df_from_disk.columns) == expected_columns - - -@pytest.mark.parametrize( - ["filename", "expected"], - [ - ("jobz.csv", CsvJobDatabase), - ("jobz.parquet", ParquetJobDatabase), - ], -) -def test_get_job_db(tmp_path, filename, expected): - path = tmp_path / filename - db = get_job_db(path) - assert isinstance(db, expected) - assert not path.exists() - - -@pytest.mark.parametrize( - ["filename", "expected"], - [ - ("jobz.csv", CsvJobDatabase), - ("jobz.parquet", ParquetJobDatabase), - ], -) -def test_create_job_db(tmp_path, filename, expected): - df = pd.DataFrame({"year": [2023, 2024]}) - path = tmp_path / filename - db = create_job_db(path=path, df=df) - assert isinstance(db, expected) - assert path.exists() From 7809ca52f10b5ccf135febff7cc5e1d5e269f400 Mon Sep 17 00:00:00 2001 From: Hans Vanrompay Date: Wed, 5 Nov 2025 15:52:54 +0100 Subject: [PATCH 08/10] avoid circular import --- openeo/extra/job_management/_df_schema.py | 42 +++++++++++++++++++++ openeo/extra/job_management/_job_db.py | 45 +++++++++++++++++++++-- openeo/extra/job_management/_manager.py | 34 ++--------------- 3 files changed, 87 insertions(+), 34 deletions(-) create mode 100644 openeo/extra/job_management/_df_schema.py diff --git a/openeo/extra/job_management/_df_schema.py b/openeo/extra/job_management/_df_schema.py new file mode 100644 index 000000000..5140a5d04 --- /dev/null +++ b/openeo/extra/job_management/_df_schema.py @@ -0,0 +1,42 @@ +import dataclasses +from typing import Any, Mapping +import pandas as pd + +@dataclasses.dataclass(frozen=True) +class _ColumnProperties: + """Expected/required properties of a column in the job manager related dataframes""" + + dtype: 
str = "object" + default: Any = None + +# Expected columns in the job DB dataframes. +# TODO: make this part of public API when settled? +# TODO: move non official statuses to seperate column (not_started, queued_for_start) +_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { + "id": _ColumnProperties(dtype="str"), + "backend_name": _ColumnProperties(dtype="str"), + "status": _ColumnProperties(dtype="str", default="not_started"), + # TODO: use proper date/time dtype instead of legacy str for start times? + "start_time": _ColumnProperties(dtype="str"), + "running_start_time": _ColumnProperties(dtype="str"), + # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, + # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. + # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? + "cpu": _ColumnProperties(dtype="str"), + "memory": _ColumnProperties(dtype="str"), + "duration": _ColumnProperties(dtype="str"), + "costs": _ColumnProperties(dtype="float64"), +} + +def _normalize(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. + + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. + """ + new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) + + return df \ No newline at end of file diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py index c6e3521fd..b0bb43ab9 100644 --- a/openeo/extra/job_management/_job_db.py +++ b/openeo/extra/job_management/_job_db.py @@ -7,7 +7,7 @@ import pandas as pd from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._manager import MultiBackendJobManager +from openeo.extra.job_management._df_schema import _normalize, _COLUMN_REQUIREMENTS _log = logging.getLogger(__name__) @@ -40,7 +40,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"): else: # TODO handle other on_exists modes: e.g. overwrite, merge, ... raise ValueError(f"Invalid on_exists={on_exists!r}") - df = MultiBackendJobManager._normalize_df(df) + df = _normalize(df) self.persist(df) # Return self to allow chaining with constructor. return self @@ -133,7 +133,7 @@ def read(self) -> pd.DataFrame: df = pd.read_csv( self.path, # TODO: possible to avoid hidden coupling with MultiBackendJobManager here? - dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()}, + dtype={c: r.dtype for (c, r) in _COLUMN_REQUIREMENTS.items()}, ) if ( "geometry" in df.columns @@ -203,3 +203,42 @@ def persist(self, df: pd.DataFrame): self.df.to_parquet(self.path, index=False) +def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: + """ + Factory to get a job database at a given path, + guessing the database type from filename extension. + + :param path: path to job database file. + + .. 
versionadded:: 0.33.0 + """ + path = Path(path) + if path.suffix.lower() in {".csv"}: + job_db = CsvJobDatabase(path=path) + elif path.suffix.lower() in {".parquet", ".geoparquet"}: + job_db = ParquetJobDatabase(path=path) + else: + raise ValueError(f"Could not guess job database type from {path!r}") + return job_db + + +def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): + """ + Factory to create a job database at given path, + initialized from a given dataframe, + and its database type guessed from filename extension. + + :param path: Path to the job database file. + :param df: DataFrame to store in the job database. + :param on_exists: What to do when the job database already exists: + - "error": (default) raise an exception + - "skip": work with existing database, ignore given dataframe and skip any initialization + + .. versionadded:: 0.33.0 + """ + job_db = get_job_db(path) + if isinstance(job_db, FullDataFrameJobDatabase): + job_db.initialize_from_df(df=df, on_exists=on_exists) + else: + raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") + return job_db \ No newline at end of file diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py index febb15bae..265ab6277 100644 --- a/openeo/extra/job_management/_manager.py +++ b/openeo/extra/job_management/_manager.py @@ -13,7 +13,6 @@ Callable, Dict, List, - Mapping, NamedTuple, Optional, Tuple, @@ -31,7 +30,8 @@ _JobStartTask, ) from openeo.extra.job_management._interface import JobDatabaseInterface -#from openeo.extra.job_management._job_db import get_job_db #TODO circular import +from openeo.extra.job_management._job_db import get_job_db +from openeo.extra.job_management._df_schema import _normalize from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth @@ -55,13 +55,6 @@ class _Backend(NamedTuple): # Maximum number of jobs to allow in parallel on a backend parallel_jobs: int -@dataclasses.dataclass(frozen=True) -class _ColumnProperties: - """Expected/required properties of a column in the job manager related dataframes""" - - dtype: str = "object" - default: Any = None - @@ -132,24 +125,6 @@ def start_job( Added ``cancel_running_job_after`` parameter. """ - # Expected columns in the job DB dataframes. - # TODO: make this part of public API when settled? - # TODO: move non official statuses to seperate column (not_started, queued_for_start) - _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { - "id": _ColumnProperties(dtype="str"), - "backend_name": _ColumnProperties(dtype="str"), - "status": _ColumnProperties(dtype="str", default="not_started"), - # TODO: use proper date/time dtype instead of legacy str for start times? - "start_time": _ColumnProperties(dtype="str"), - "running_start_time": _ColumnProperties(dtype="str"), - # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, - # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. - # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit? - "cpu": _ColumnProperties(dtype="str"), - "memory": _ColumnProperties(dtype="str"), - "duration": _ColumnProperties(dtype="str"), - "costs": _ColumnProperties(dtype="float64"), - } def __init__( self, @@ -259,10 +234,7 @@ def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: :param df: The dataframe to normalize. :return: a new dataframe that is normalized. 
""" - new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) - - return df + return _normalize(df) def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): """ From fa4f7538d60ede64e4f9c7860ea000bed8ab8632 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Wed, 5 Nov 2025 16:37:19 +0100 Subject: [PATCH 09/10] ruff based code/import cleanup #741/#815 --- openeo/extra/job_management/__init__.py | 14 ++++++++----- openeo/extra/job_management/_df_schema.py | 22 ++++++++++++--------- openeo/extra/job_management/_interface.py | 14 ++----------- openeo/extra/job_management/_job_db.py | 10 ++++++---- openeo/extra/job_management/_manager.py | 21 ++++++++++---------- tests/extra/job_management/test_job_db.py | 5 ++--- tests/extra/job_management/test_manager.py | 23 +++++++++++----------- 7 files changed, 53 insertions(+), 56 deletions(-) diff --git a/openeo/extra/job_management/__init__.py b/openeo/extra/job_management/__init__.py index dd42ac1a8..eda8b1ff7 100644 --- a/openeo/extra/job_management/__init__.py +++ b/openeo/extra/job_management/__init__.py @@ -1,7 +1,13 @@ from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._job_db import FullDataFrameJobDatabase, CsvJobDatabase, ParquetJobDatabase, create_job_db, get_job_db -from openeo.extra.job_management.process_based import ProcessBasedJobCreator +from openeo.extra.job_management._job_db import ( + CsvJobDatabase, + FullDataFrameJobDatabase, + ParquetJobDatabase, + create_job_db, + get_job_db, +) from openeo.extra.job_management._manager import MultiBackendJobManager +from openeo.extra.job_management.process_based import ProcessBasedJobCreator __all__ = [ "JobDatabaseInterface", @@ -11,7 +17,5 @@ "ProcessBasedJobCreator", "create_job_db", "get_job_db", - "MultiBackendJobManager" + "MultiBackendJobManager", ] - - diff --git a/openeo/extra/job_management/_df_schema.py b/openeo/extra/job_management/_df_schema.py index 5140a5d04..a17015390 100644 --- a/openeo/extra/job_management/_df_schema.py +++ b/openeo/extra/job_management/_df_schema.py @@ -1,7 +1,9 @@ import dataclasses from typing import Any, Mapping + import pandas as pd + @dataclasses.dataclass(frozen=True) class _ColumnProperties: """Expected/required properties of a column in the job manager related dataframes""" @@ -9,6 +11,7 @@ class _ColumnProperties: dtype: str = "object" default: Any = None + # Expected columns in the job DB dataframes. # TODO: make this part of public API when settled? # TODO: move non official statuses to seperate column (not_started, queued_for_start) @@ -28,15 +31,16 @@ class _ColumnProperties: "costs": _ColumnProperties(dtype="float64"), } + def _normalize(df: pd.DataFrame) -> pd.DataFrame: - """ - Normalize given pandas dataframe (creating a new one): - ensure we have the required columns. + """ + Normalize given pandas dataframe (creating a new one): + ensure we have the required columns. - :param df: The dataframe to normalize. - :return: a new dataframe that is normalized. - """ - new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns} - df = df.assign(**new_columns) + :param df: The dataframe to normalize. + :return: a new dataframe that is normalized. 
+ """ + new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns} + df = df.assign(**new_columns) - return df \ No newline at end of file + return df diff --git a/openeo/extra/job_management/_interface.py b/openeo/extra/job_management/_interface.py index 32f6c9d0b..215a1cd3a 100644 --- a/openeo/extra/job_management/_interface.py +++ b/openeo/extra/job_management/_interface.py @@ -1,14 +1,9 @@ import abc -from typing import ( - Callable, - Iterable, - List, - NamedTuple, - Union, -) +from typing import Iterable, List, Union import pandas as pd + class JobDatabaseInterface(metaclass=abc.ABCMeta): """ Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, @@ -68,8 +63,3 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: :return: DataFrame with jobs filtered by indices. """ ... - - - - - diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py index b0bb43ab9..a0be53443 100644 --- a/openeo/extra/job_management/_job_db.py +++ b/openeo/extra/job_management/_job_db.py @@ -1,16 +1,18 @@ import abc import logging -import shapely.errors -import shapely.wkt from pathlib import Path from typing import Iterable, Union + import pandas as pd +import shapely.errors +import shapely.wkt +from openeo.extra.job_management._df_schema import _COLUMN_REQUIREMENTS, _normalize from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._df_schema import _normalize, _COLUMN_REQUIREMENTS _log = logging.getLogger(__name__) + class FullDataFrameJobDatabase(JobDatabaseInterface): def __init__(self): super().__init__() @@ -241,4 +243,4 @@ def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = job_db.initialize_from_df(df=df, on_exists=on_exists) else: raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") - return job_db \ No newline at end of file + return job_db diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py index 265ab6277..a76d3a7ff 100644 --- a/openeo/extra/job_management/_manager.py +++ b/openeo/extra/job_management/_manager.py @@ -1,6 +1,5 @@ import collections import contextlib -import dataclasses import datetime import json import logging @@ -25,28 +24,31 @@ from urllib3.util import Retry from openeo import BatchJob, Connection +from openeo.extra.job_management._df_schema import _normalize +from openeo.extra.job_management._interface import JobDatabaseInterface +from openeo.extra.job_management._job_db import get_job_db from openeo.extra.job_management._thread_worker import ( _JobManagerWorkerThreadPool, _JobStartTask, ) -from openeo.extra.job_management._interface import JobDatabaseInterface -from openeo.extra.job_management._job_db import get_job_db -from openeo.extra.job_management._df_schema import _normalize - from openeo.rest import OpenEoApiError from openeo.rest.auth.auth import BearerAuth from openeo.util import deep_get, rfc3339 - _log = logging.getLogger(__name__) + MAX_RETRIES = 50 + + # Sentinel value to indicate that a parameter was not set _UNSET = object() + def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs): raise NotImplementedError("No 'start_job' callable provided") + class _Backend(NamedTuple): """Container for backend info/settings""" @@ -56,8 +58,6 @@ class _Backend(NamedTuple): parallel_jobs: int - - class MultiBackendJobManager: """ Tracker for multiple jobs on 
multiple backends. @@ -125,7 +125,6 @@ def start_job( Added ``cancel_running_job_after`` parameter. """ - def __init__( self, poll_sleep: int = 60, @@ -409,7 +408,7 @@ def run_jobs( assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}" if isinstance(job_db, (str, Path)): - job_db = get_job_db(path=job_db) #TODO circular import + job_db = get_job_db(path=job_db) # TODO circular import if not isinstance(job_db, JobDatabaseInterface): raise ValueError(f"Unsupported job_db {job_db!r}") @@ -806,7 +805,7 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] = job_db.persist(active) return jobs_done, jobs_error, jobs_cancel - + def _format_usage_stat(job_metadata: dict, field: str) -> str: value = deep_get(job_metadata, "usage", field, "value", default=0) diff --git a/tests/extra/job_management/test_job_db.py b/tests/extra/job_management/test_job_db.py index 53ccce8d5..d1f6c5a8f 100644 --- a/tests/extra/job_management/test_job_db.py +++ b/tests/extra/job_management/test_job_db.py @@ -1,5 +1,5 @@ - import re + import geopandas # TODO: can we avoid using httpretty? @@ -18,7 +18,6 @@ create_job_db, get_job_db, ) - from openeo.utils.version import ComparableVersion JOB_DB_DF_BASICS = pd.DataFrame( @@ -378,4 +377,4 @@ def test_create_job_db(tmp_path, filename, expected): path = tmp_path / filename db = create_job_db(path=path, df=df) assert isinstance(db, expected) - assert path.exists() \ No newline at end of file + assert path.exists() diff --git a/tests/extra/job_management/test_manager.py b/tests/extra/job_management/test_manager.py index 58fcf2245..ffc06e049 100644 --- a/tests/extra/job_management/test_manager.py +++ b/tests/extra/job_management/test_manager.py @@ -26,16 +26,15 @@ import openeo from openeo import BatchJob -from openeo.extra.job_management._manager import ( - MAX_RETRIES, - MultiBackendJobManager, -) - from openeo.extra.job_management._job_db import ( CsvJobDatabase, ParquetJobDatabase, create_job_db, ) +from openeo.extra.job_management._manager import ( + MAX_RETRIES, + MultiBackendJobManager, +) from openeo.extra.job_management._thread_worker import ( Task, _JobManagerWorkerThreadPool, @@ -604,7 +603,9 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to not actually sleep, but skip one hour at a time - with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object( + openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60) + ): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -723,7 +724,9 @@ def get_status(job_id, current_status): job_db_path = tmp_path / "jobs.csv" # Mock sleep() to skip one hour at a time instead of actually sleeping - with mock.patch.object(openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)): + with mock.patch.object( + openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60) + ): job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path) final_df = CsvJobDatabase(job_db_path).read() @@ -865,9 +868,8 @@ def test_refresh_bearer_token_before_start( dummy_backend_bar, job_manager_root_dir, sleep_mock, - requests_mock + requests_mock, ): - client_id = "client123" client_secret = "$3cr3t" oidc_issuer = "https://oidc.test/" @@ -902,6 +904,3 @@ def test_refresh_bearer_token_before_start( # 
Because of proactive+throttled token refreshing,
        # we should have 2 additional token requests now
        assert len(oidc_mock.grant_request_history) == 4
-
-
-

From 71971de40ec33f9629ca1b62ff41e973aa6037d1 Mon Sep 17 00:00:00 2001
From: Stefaan Lippens
Date: Wed, 5 Nov 2025 19:11:31 +0100
Subject: [PATCH 10/10] Tighter encapsulation of "column requirements" of
 MultiBackendJobManager #741/#815

Using another solution to break the import cycle
(and some doc/test tweaks along the way)
---
 openeo/extra/job_management/_df_schema.py  | 46 ------------
 openeo/extra/job_management/_interface.py  |  3 +-
 openeo/extra/job_management/_job_db.py     | 12 ++--
 openeo/extra/job_management/_manager.py    | 83 ++++++++++++++++++----
 openeo/extra/job_management/stac_job_db.py |  8 +--
 tests/extra/job_management/test_job_db.py  |  7 --
 tests/extra/job_management/test_manager.py | 15 ++--
 7 files changed, 87 insertions(+), 87 deletions(-)
 delete mode 100644 openeo/extra/job_management/_df_schema.py

diff --git a/openeo/extra/job_management/_df_schema.py b/openeo/extra/job_management/_df_schema.py
deleted file mode 100644
index a17015390..000000000
--- a/openeo/extra/job_management/_df_schema.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import dataclasses
-from typing import Any, Mapping
-
-import pandas as pd
-
-
-@dataclasses.dataclass(frozen=True)
-class _ColumnProperties:
-    """Expected/required properties of a column in the job manager related dataframes"""
-
-    dtype: str = "object"
-    default: Any = None
-
-
-# Expected columns in the job DB dataframes.
-# TODO: make this part of public API when settled?
-# TODO: move non official statuses to seperate column (not_started, queued_for_start)
-_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = {
-    "id": _ColumnProperties(dtype="str"),
-    "backend_name": _ColumnProperties(dtype="str"),
-    "status": _ColumnProperties(dtype="str", default="not_started"),
-    # TODO: use proper date/time dtype instead of legacy str for start times?
-    "start_time": _ColumnProperties(dtype="str"),
-    "running_start_time": _ColumnProperties(dtype="str"),
-    # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
-    #       but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
-    #       Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
-    "cpu": _ColumnProperties(dtype="str"),
-    "memory": _ColumnProperties(dtype="str"),
-    "duration": _ColumnProperties(dtype="str"),
-    "costs": _ColumnProperties(dtype="float64"),
-}
-
-
-def _normalize(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Normalize given pandas dataframe (creating a new one):
-    ensure we have the required columns.
-
-    :param df: The dataframe to normalize.
-    :return: a new dataframe that is normalized.
-    """
-    new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns}
-    df = df.assign(**new_columns)
-
-    return df
diff --git a/openeo/extra/job_management/_interface.py b/openeo/extra/job_management/_interface.py
index 215a1cd3a..6f7b6a9cd 100644
--- a/openeo/extra/job_management/_interface.py
+++ b/openeo/extra/job_management/_interface.py
@@ -6,7 +6,8 @@
 
 class JobDatabaseInterface(metaclass=abc.ABCMeta):
     """
-    Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`,
+    Interface for a database of job metadata to use with the
+    :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager`,
     allowing to regularly persist the job metadata while polling the job statuses
     and resume/restart the job tracking after it was interrupted.
 
diff --git a/openeo/extra/job_management/_job_db.py b/openeo/extra/job_management/_job_db.py
index a0be53443..8e18d5c98 100644
--- a/openeo/extra/job_management/_job_db.py
+++ b/openeo/extra/job_management/_job_db.py
@@ -7,7 +7,7 @@
 import shapely.errors
 import shapely.wkt
 
-from openeo.extra.job_management._df_schema import _COLUMN_REQUIREMENTS, _normalize
+import openeo.extra.job_management._manager
 from openeo.extra.job_management._interface import JobDatabaseInterface
 
 _log = logging.getLogger(__name__)
@@ -22,7 +22,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         """
         Initialize the job database from a given dataframe,
         which will be first normalized to be compatible
-        with :py:class:`MultiBackendJobManager` usage.
+        with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager` usage.
 
         :param df: dataframe with some columns your ``start_job`` callable expects
         :param on_exists: what to do when the job database already exists (persisted on disk):
@@ -42,7 +42,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
             else:
                 # TODO handle other on_exists modes: e.g. overwrite, merge, ...
                 raise ValueError(f"Invalid on_exists={on_exists!r}")
-        df = _normalize(df)
+        df = openeo.extra.job_management._manager.MultiBackendJobManager._column_requirements.normalize_df(df)
         self.persist(df)
         # Return self to allow chaining with constructor.
         return self
@@ -104,7 +104,7 @@ class CsvJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a CSV file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     :param path: Path to local CSV file.
 
     .. note::
@@ -135,7 +135,7 @@ def read(self) -> pd.DataFrame:
         df = pd.read_csv(
             self.path,
             # TODO: possible to avoid hidden coupling with MultiBackendJobManager here?
-            dtype={c: r.dtype for (c, r) in _COLUMN_REQUIREMENTS.items()},
+            dtype=openeo.extra.job_management._manager.MultiBackendJobManager._column_requirements.dtype_mapping(),
         )
         if (
             "geometry" in df.columns
@@ -159,7 +159,7 @@ class ParquetJobDatabase(FullDataFrameJobDatabase):
     """
     Persist/load job metadata with a Parquet file.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     :param path: Path to the Parquet file.
 
     .. note::
diff --git a/openeo/extra/job_management/_manager.py b/openeo/extra/job_management/_manager.py
index a76d3a7ff..e264c8731 100644
--- a/openeo/extra/job_management/_manager.py
+++ b/openeo/extra/job_management/_manager.py
@@ -1,5 +1,6 @@
 import collections
 import contextlib
+import dataclasses
 import datetime
 import json
 import logging
@@ -12,6 +13,7 @@
     Callable,
     Dict,
     List,
+    Mapping,
     NamedTuple,
     Optional,
     Tuple,
@@ -23,10 +25,10 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
 
+# TODO avoid this (circular) dependency on _job_db?
+import openeo.extra.job_management._job_db
 from openeo import BatchJob, Connection
-from openeo.extra.job_management._df_schema import _normalize
 from openeo.extra.job_management._interface import JobDatabaseInterface
-from openeo.extra.job_management._job_db import get_job_db
 from openeo.extra.job_management._thread_worker import (
     _JobManagerWorkerThreadPool,
     _JobStartTask,
@@ -38,6 +40,7 @@
 
 _log = logging.getLogger(__name__)
 
+# TODO: eliminate this module constant (should be part of some constructor interface)
 MAX_RETRIES = 50
 
 
@@ -58,6 +61,45 @@ class _Backend(NamedTuple):
     parallel_jobs: int
 
 
+@dataclasses.dataclass(frozen=True)
+class _ColumnProperties:
+    """Expected/required properties of a column in the job manager related dataframes"""
+
+    dtype: str = "object"
+    default: Any = None
+
+
+class _ColumnRequirements:
+    """
+    Helper class to encapsulate the column requirements expected by MultiBackendJobManager.
+    The current implementation (e.g. _job_db) has some undesired coupling here,
+    but it turns out quite hard to eliminate.
+    The goal of this class is, currently, to at least make the coupling explicit
+    in a centralized way.
+    """
+
+    def __init__(self, requirements: Mapping[str, _ColumnProperties]):
+        self._requirements = dict(requirements)
+
+    def normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Normalize given pandas dataframe (creating a new one):
+        ensure we have the required columns.
+
+        :param df: The dataframe to normalize.
+        :return: a new dataframe that is normalized.
+        """
+        new_columns = {col: req.default for (col, req) in self._requirements.items() if col not in df.columns}
+        df = df.assign(**new_columns)
+        return df
+
+    def dtype_mapping(self) -> Dict[str, str]:
+        """
+        Get mapping of column name to expected dtype string, e.g. to be used with pandas.read_csv(dtype=...)
+        """
+        return {col: req.dtype for (col, req) in self._requirements.items()}
+
+
 class MultiBackendJobManager:
     """
     Tracker for multiple jobs on multiple backends.
@@ -125,6 +167,27 @@ def start_job(
         Added ``cancel_running_job_after`` parameter.
     """
 
+    # Expected columns in the job DB dataframes.
+    # TODO: make this part of public API when settled?
+    # TODO: move non official statuses to separate column (not_started, queued_for_start)
+    _column_requirements: _ColumnRequirements = _ColumnRequirements(
+        {
+            "id": _ColumnProperties(dtype="str"),
+            "backend_name": _ColumnProperties(dtype="str"),
+            "status": _ColumnProperties(dtype="str", default="not_started"),
+            # TODO: use proper date/time dtype instead of legacy str for start times?
+            "start_time": _ColumnProperties(dtype="str"),
+            "running_start_time": _ColumnProperties(dtype="str"),
+            # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
+            #       but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
+            #       Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
+            "cpu": _ColumnProperties(dtype="str"),
+            "memory": _ColumnProperties(dtype="str"),
+            "duration": _ColumnProperties(dtype="str"),
+            "costs": _ColumnProperties(dtype="float64"),
+        }
+    )
+
     def __init__(
         self,
         poll_sleep: int = 60,
@@ -227,13 +290,9 @@ def _make_resilient(connection):
     @classmethod
     def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Normalize given pandas dataframe (creating a new one):
-        ensure we have the required columns.
-
-        :param df: The dataframe to normalize.
-        :return: a new dataframe that is normalized.
+        Deprecated, but kept for backwards compatibility
         """
-        return _normalize(df)
+        return cls._column_requirements.normalize_df(df)
 
     def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface):
         """
@@ -268,7 +327,7 @@ def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabas
         :param job_db: Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` interface.
 
             .. note::
                 Support for Parquet files depends on the ``pyarrow`` package
@@ -373,7 +432,7 @@ def run_jobs(
         :param job_db: Job database to load/store existing job status data and other metadata from/to.
             Can be specified as a path to CSV or Parquet file,
-            or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
+            or as a custom database object following the :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` interface.
 
             .. note::
                 Support for Parquet files depends on the ``pyarrow`` package
@@ -389,7 +448,7 @@ def run_jobs(
         .. versionchanged:: 0.31.0
             Replace ``output_file`` argument with ``job_db`` argument,
             which can be a path to a CSV or Parquet file,
-            or a user-defined :py:class:`JobDatabaseInterface` object.
+            or a user-defined :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface` object.
             The deprecated ``output_file`` argument is still supported for now.
 
         .. versionchanged:: 0.33.0
@@ -408,7 +467,7 @@ def run_jobs(
         assert not kwargs, f"Unexpected keyword arguments: {kwargs!r}"
 
         if isinstance(job_db, (str, Path)):
-            job_db = get_job_db(path=job_db)  # TODO circular import
+            job_db = openeo.extra.job_management._job_db.get_job_db(path=job_db)
 
         if not isinstance(job_db, JobDatabaseInterface):
             raise ValueError(f"Unsupported job_db {job_db!r}")
diff --git a/openeo/extra/job_management/stac_job_db.py b/openeo/extra/job_management/stac_job_db.py
index bcd25da12..9f1dff8f5 100644
--- a/openeo/extra/job_management/stac_job_db.py
+++ b/openeo/extra/job_management/stac_job_db.py
@@ -22,7 +22,7 @@ class STACAPIJobDatabase(JobDatabaseInterface):
 
     Unstable API, subject to change.
 
-    :implements: :py:class:`JobDatabaseInterface`
+    :implements: :py:class:`~openeo.extra.job_management._interface.JobDatabaseInterface`
     """
 
     def __init__(
@@ -56,10 +56,10 @@ def exists(self) -> bool:
 
     def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Normalize the given dataframe to be compatible with :py:class:`MultiBackendJobManager`
+        Normalize the given dataframe to be compatible with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager`
         by adding the default columns and setting the index.
         """
-        df = MultiBackendJobManager._normalize_df(df)
+        df = MultiBackendJobManager._column_requirements.normalize_df(df)
         # If the user doesn't specify the item_id column, we will use the index.
         if "item_id" not in df.columns:
             df = df.reset_index(names=["item_id"])
@@ -69,7 +69,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
         """
         Initialize the job database from a given dataframe,
         which will be first normalized to be compatible
-        with :py:class:`MultiBackendJobManager` usage.
+        with :py:class:`~openeo.extra.job_management._manager.MultiBackendJobManager` usage.
 
         :param df: dataframe with some columns your ``start_job`` callable expects
         :param on_exists: what to do when the job database already exists (persisted on disk):
diff --git a/tests/extra/job_management/test_job_db.py b/tests/extra/job_management/test_job_db.py
index d1f6c5a8f..efd1a09e0 100644
--- a/tests/extra/job_management/test_job_db.py
+++ b/tests/extra/job_management/test_job_db.py
@@ -1,13 +1,6 @@
 import re
 
 import geopandas
-
-# TODO: can we avoid using httpretty?
-# We need it for testing the resilience, which uses an HTTPadapter with Retry
-# but requests-mock also uses an HTTPAdapter for the mocking and basically
-# erases the HTTPAdapter we have set up.
-# httpretty avoids this specific problem because it mocks at the socket level,
-# But I would rather not have two dependencies with almost the same goal.
 import pandas as pd
 import pytest
 import shapely.geometry
diff --git a/tests/extra/job_management/test_manager.py b/tests/extra/job_management/test_manager.py
index ffc06e049..ea5873f11 100644
--- a/tests/extra/job_management/test_manager.py
+++ b/tests/extra/job_management/test_manager.py
@@ -31,10 +31,7 @@
     ParquetJobDatabase,
     create_job_db,
 )
-from openeo.extra.job_management._manager import (
-    MAX_RETRIES,
-    MultiBackendJobManager,
-)
+from openeo.extra.job_management._manager import MAX_RETRIES, MultiBackendJobManager
 from openeo.extra.job_management._thread_worker import (
     Task,
     _JobManagerWorkerThreadPool,
@@ -296,7 +293,7 @@ def test_start_job_thread_basic(self, tmp_path, job_manager, job_manager_root_di
 
     def test_normalize_df(self):
        df = pd.DataFrame({"some_number": [3, 2, 1]})
-        df_normalized = MultiBackendJobManager._normalize_df(df)
+        df_normalized = MultiBackendJobManager._column_requirements.normalize_df(df)
         assert set(df_normalized.columns) == set(
             [
                 "some_number",
@@ -603,9 +600,7 @@ def get_status(job_id, current_status):
         job_db_path = tmp_path / "jobs.csv"
 
         # Mock sleep() to not actually sleep, but skip one hour at a time
-        with mock.patch.object(
-            openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)
-        ):
+        with mock.patch("time.sleep", new=lambda s: time_machine.shift(60 * 60)):
             job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
 
         final_df = CsvJobDatabase(job_db_path).read()
@@ -724,9 +719,7 @@ def get_status(job_id, current_status):
         job_db_path = tmp_path / "jobs.csv"
 
         # Mock sleep() to skip one hour at a time instead of actually sleeping
-        with mock.patch.object(
-            openeo.extra.job_management._manager.time, "sleep", new=lambda s: time_machine.shift(60 * 60)
-        ):
+        with mock.patch("time.sleep", new=lambda s: time_machine.shift(60 * 60)):
             job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
 
         final_df = CsvJobDatabase(job_db_path).read()