11import collections
22import contextlib
3+ import dataclasses
34import datetime
45import json
56import logging
1213 Callable ,
1314 Dict ,
1415 List ,
16+ Mapping ,
1517 NamedTuple ,
1618 Optional ,
1719 Tuple ,
2325from requests .adapters import HTTPAdapter
2426from urllib3 .util import Retry
2527
28+ # TODO avoid this (circular) dependency on _job_db?
29+ import openeo .extra .job_management ._job_db
2630from openeo import BatchJob , Connection
27- from openeo .extra .job_management ._df_schema import _normalize
2831from openeo .extra .job_management ._interface import JobDatabaseInterface
29- from openeo .extra .job_management ._job_db import get_job_db
3032from openeo .extra .job_management ._thread_worker import (
3133 _JobManagerWorkerThreadPool ,
3234 _JobStartTask ,
3840_log = logging .getLogger (__name__ )
3941
4042
43+ # TODO: eliminate this module constant (should be part of some constructor interface)
4144MAX_RETRIES = 50
4245
4346
@@ -58,6 +61,45 @@ class _Backend(NamedTuple):
5861 parallel_jobs : int
5962
6063
64+ @dataclasses .dataclass (frozen = True )
65+ class _ColumnProperties :
66+ """Expected/required properties of a column in the job manager related dataframes"""
67+
68+ dtype : str = "object"
69+ default : Any = None
70+
71+
class _ColumnRequirements:
    """
    Helper class to encapsulate the column requirements expected by MultiBackendJobManager.
    The current implementation (e.g. _job_db) has some undesired coupling here,
    but it turns out quite hard to eliminate.
    The goal of this class is, currently, to at least make the coupling explicit
    in a centralized way.
    """

    def __init__(self, requirements: Mapping[str, _ColumnProperties]):
        # Take a private copy, so later mutation of the caller's mapping has no effect here.
        self._requirements = dict(requirements)

    def normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize given pandas dataframe (creating a new one):
        ensure we have the required columns.

        :param df: The dataframe to normalize.
        :return: a new dataframe that is normalized.
        """
        # Collect defaults for required columns that are not present yet.
        to_add = {}
        for column, props in self._requirements.items():
            if column not in df.columns:
                to_add[column] = props.default
        # `DataFrame.assign` returns a new dataframe (original is left untouched).
        return df.assign(**to_add)

    def dtype_mapping(self) -> Dict[str, str]:
        """
        Get mapping of column name to expected dtype string, e.g. to be used with pandas.read_csv(dtype=...)
        """
        return dict((column, props.dtype) for column, props in self._requirements.items())
102+
61103class MultiBackendJobManager :
62104 """
63105 Tracker for multiple jobs on multiple backends.
@@ -125,6 +167,27 @@ def start_job(
125167 Added ``cancel_running_job_after`` parameter.
126168 """
127169
    # Expected columns in the job DB dataframes.
    # Defaults here are what `_ColumnRequirements.normalize_df` fills in for missing columns;
    # dtypes feed `_ColumnRequirements.dtype_mapping` (e.g. for pandas.read_csv(dtype=...)).
    # TODO: make this part of public API when settled?
    # TODO: move non official statuses to separate column (not_started, queued_for_start)
    _column_requirements: _ColumnRequirements = _ColumnRequirements(
        {
            # Backend-assigned job id (no default: stays None until the job is created).
            "id": _ColumnProperties(dtype="str"),
            "backend_name": _ColumnProperties(dtype="str"),
            # Jobs start out in the (non-official) "not_started" status.
            "status": _ColumnProperties(dtype="str", default="not_started"),
            # TODO: use proper date/time dtype instead of legacy str for start times?
            "start_time": _ColumnProperties(dtype="str"),
            "running_start_time": _ColumnProperties(dtype="str"),
            # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
            # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
            # Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
            "cpu": _ColumnProperties(dtype="str"),
            "memory": _ColumnProperties(dtype="str"),
            "duration": _ColumnProperties(dtype="str"),
            "costs": _ColumnProperties(dtype="float64"),
        }
    )
190+
128191 def __init__ (
129192 self ,
130193 poll_sleep : int = 60 ,
@@ -227,13 +290,9 @@ def _make_resilient(connection):
    @classmethod
    def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame:
        """
        Deprecated, but kept for backwards compatibility.

        Normalization logic now lives in ``_ColumnRequirements.normalize_df``;
        this wrapper just delegates to the class-level ``_column_requirements``.

        :param df: The dataframe to normalize.
        :return: a new, normalized dataframe (input is not mutated).
        """
        return cls._column_requirements.normalize_df(df)
237296
238297 def start_job_thread (self , start_job : Callable [[], BatchJob ], job_db : JobDatabaseInterface ):
239298 """
@@ -268,7 +327,7 @@ def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabas
268327 :param job_db:
269328 Job database to load/store existing job status data and other metadata from/to.
270329 Can be specified as a path to CSV or Parquet file,
271- or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
330+ or as a custom database object following the :py:class:`~openeo.extra.job_management._interface. JobDatabaseInterface` interface.
272331
273332 .. note::
274333 Support for Parquet files depends on the ``pyarrow`` package
@@ -373,7 +432,7 @@ def run_jobs(
373432 :param job_db:
374433 Job database to load/store existing job status data and other metadata from/to.
375434 Can be specified as a path to CSV or Parquet file,
376- or as a custom database object following the :py:class:`JobDatabaseInterface` interface.
435+ or as a custom database object following the :py:class:`~openeo.extra.job_management._interface. JobDatabaseInterface` interface.
377436
378437 .. note::
379438 Support for Parquet files depends on the ``pyarrow`` package
@@ -389,7 +448,7 @@ def run_jobs(
389448 .. versionchanged:: 0.31.0
390449 Replace ``output_file`` argument with ``job_db`` argument,
391450 which can be a path to a CSV or Parquet file,
392- or a user-defined :py:class:`JobDatabaseInterface` object.
451+ or a user-defined :py:class:`~openeo.extra.job_management._interface. JobDatabaseInterface` object.
393452 The deprecated ``output_file`` argument is still supported for now.
394453
395454 .. versionchanged:: 0.33.0
@@ -408,7 +467,7 @@ def run_jobs(
408467 assert not kwargs , f"Unexpected keyword arguments: { kwargs !r} "
409468
410469 if isinstance (job_db , (str , Path )):
411- job_db = get_job_db (path = job_db ) # TODO circular import
470+ job_db = openeo . extra . job_management . _job_db . get_job_db (path = job_db )
412471
413472 if not isinstance (job_db , JobDatabaseInterface ):
414473 raise ValueError (f"Unsupported job_db { job_db !r} " )
0 commit comments