|
13 | 13 | Callable, |
14 | 14 | Dict, |
15 | 15 | List, |
16 | | - Mapping, |
17 | 16 | NamedTuple, |
18 | 17 | Optional, |
19 | 18 | Tuple, |
|
31 | 30 | _JobStartTask, |
32 | 31 | ) |
33 | 32 | from openeo.extra.job_management.process_based_job_creator import ProcessBasedJobCreator |
34 | | -from openeo.extra.job_management.dataframe_job_db import JobDatabaseInterface, FullDataFrameJobDatabase, ParquetJobDatabase, CsvJobDatabase |
| 33 | +from openeo.extra.job_management._job_database import FullDataFrameJobDatabase, JobDatabaseInterface, ParquetJobDatabase, CsvJobDatabase, create_job_db, get_job_db |
| 34 | +from openeo.extra.job_management._dataframe_utils import normalize_dataframe |
| 35 | + |
35 | 36 |
|
36 | 37 | from openeo.rest import OpenEoApiError |
37 | 38 | from openeo.rest.auth.auth import BearerAuth |
|
44 | 45 | "FullDataFrameJobDatabase", |
45 | 46 | "ParquetJobDatabase", |
46 | 47 | "CsvJobDatabase", |
| 48 | + "ProcessBasedJobCreator", |
| 49 | + "create_job_db", |
| 50 | + "get_job_db", |
| 51 | + |
47 | 52 | ] |
48 | 53 |
|
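The `__all__` entries added above re-export the factory helpers and `ProcessBasedJobCreator` from the package root, so downstream code is unaffected by the move of the implementations into the private `_job_database` module. A quick sketch of the resulting import surface (listing only the names visible in this hunk; the full `__all__` may contain more):

```python
# These imports keep working from the package root after the refactor:
from openeo.extra.job_management import (
    CsvJobDatabase,
    FullDataFrameJobDatabase,
    ParquetJobDatabase,
    ProcessBasedJobCreator,
    create_job_db,
    get_job_db,
)
```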
49 | 54 | class _Backend(NamedTuple): |
@@ -73,6 +78,8 @@ class _ColumnProperties: |
73 | 78 | default: Any = None |
74 | 79 |
|
75 | 80 |
|
| 81 | + |
| 82 | + |
76 | 83 | class MultiBackendJobManager: |
77 | 84 | """ |
78 | 85 | Tracker for multiple jobs on multiple backends. |
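For context on the class touched below: `MultiBackendJobManager` tracks batch jobs across multiple backends, persisting state through a `JobDatabaseInterface`. A minimal driving loop, sketched from the public API (backend URL, collection id and dataframe columns are illustrative, not taken from this diff):

```python
import openeo
import pandas as pd
from openeo.extra.job_management import MultiBackendJobManager, create_job_db

manager = MultiBackendJobManager()
manager.add_backend("example", connection=openeo.connect("https://openeo.example"))

def start_job(row: pd.Series, connection: openeo.Connection, **kwargs):
    # Build and return a BatchJob for one dataframe row (illustrative logic).
    cube = connection.load_collection("SENTINEL2_L2A")
    return cube.create_job(title=f"Job for year {row['year']}")

df = pd.DataFrame({"year": [2021, 2022, 2023]})  # illustrative job list
job_db = create_job_db("jobs.csv", df=df, on_exists="skip")
manager.run_jobs(start_job=start_job, job_db=job_db)
```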
@@ -143,21 +150,7 @@ def start_job( |
143 | 150 | # Expected columns in the job DB dataframes. |
144 | 151 | # TODO: make this part of public API when settled? |
145 | 152 | # TODO: move non-official statuses to separate column (not_started, queued_for_start) |
146 | | - _COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = { |
147 | | - "id": _ColumnProperties(dtype="str"), |
148 | | - "backend_name": _ColumnProperties(dtype="str"), |
149 | | - "status": _ColumnProperties(dtype="str", default="not_started"), |
150 | | - # TODO: use proper date/time dtype instead of legacy str for start times? |
151 | | - "start_time": _ColumnProperties(dtype="str"), |
152 | | - "running_start_time": _ColumnProperties(dtype="str"), |
153 | | - # TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager, |
154 | | - # but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`. |
155 | | - # Since bfd99e34 they are not really required to be present anymore; can we make that more explicit? |
156 | | - "cpu": _ColumnProperties(dtype="str"), |
157 | | - "memory": _ColumnProperties(dtype="str"), |
158 | | - "duration": _ColumnProperties(dtype="str"), |
159 | | - "costs": _ColumnProperties(dtype="float64"), |
160 | | - } |
| 153 | + |
161 | 154 |
|
162 | 155 | def __init__( |
163 | 156 | self, |
@@ -258,17 +251,8 @@ def _make_resilient(connection): |
258 | 251 |
|
259 | 252 | @classmethod |
260 | 253 | def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame: |
261 | | - """ |
262 | | - Normalize given pandas dataframe (creating a new one): |
263 | | - ensure we have the required columns. |
264 | | -
|
265 | | - :param df: The dataframe to normalize. |
266 | | - :return: a new dataframe that is normalized. |
267 | | - """ |
268 | | - new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns} |
269 | | - df = df.assign(**new_columns) |
270 | 254 |
|
271 | | - return df |
| 255 | + return normalize_dataframe(df) |
272 | 256 |
|
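The `_normalize_df` classmethod now simply delegates to the relocated helper. Reconstructed from the `_COLUMN_REQUIREMENTS` table and the `df.assign` logic removed above, `normalize_dataframe` in `_dataframe_utils` presumably boils down to the following sketch (the exact name, signature and dtype handling in `_dataframe_utils` are assumptions; the column names and defaults are taken from the removed code):

```python
import pandas as pd

# Default value per required column, as in the removed _COLUMN_REQUIREMENTS
# table (all defaults were None except "status"; dtypes are omitted here).
_COLUMN_DEFAULTS = {
    "id": None,
    "backend_name": None,
    "status": "not_started",
    "start_time": None,
    "running_start_time": None,
    "cpu": None,
    "memory": None,
    "duration": None,
    "costs": None,
}


def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new dataframe that has all required job-tracking columns."""
    new_columns = {col: default for col, default in _COLUMN_DEFAULTS.items() if col not in df.columns}
    return df.assign(**new_columns)
```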
273 | 257 | def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface): |
274 | 258 | """ |
@@ -844,44 +828,3 @@ def ignore_connection_errors(context: Optional[str] = None, sleep: int = 5): |
844 | 828 |
|
845 | 829 |
|
846 | 830 |
|
847 | | -def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface: |
848 | | - """ |
849 | | - Factory to get a job database at a given path, |
850 | | - guessing the database type from filename extension. |
851 | | -
|
852 | | - :param path: path to job database file. |
853 | | -
|
854 | | - .. versionadded:: 0.33.0 |
855 | | - """ |
856 | | - path = Path(path) |
857 | | - if path.suffix.lower() in {".csv"}: |
858 | | - job_db = CsvJobDatabase(path=path) |
859 | | - elif path.suffix.lower() in {".parquet", ".geoparquet"}: |
860 | | - job_db = ParquetJobDatabase(path=path) |
861 | | - else: |
862 | | - raise ValueError(f"Could not guess job database type from {path!r}") |
863 | | - return job_db |
864 | | - |
865 | | - |
866 | | -def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): |
867 | | - """ |
868 | | - Factory to create a job database at given path, |
869 | | - initialized from a given dataframe, |
870 | | - and its database type guessed from filename extension. |
871 | | -
|
872 | | - :param path: Path to the job database file. |
873 | | - :param df: DataFrame to store in the job database. |
874 | | - :param on_exists: What to do when the job database already exists: |
875 | | - - "error": (default) raise an exception |
876 | | - - "skip": work with existing database, ignore given dataframe and skip any initialization |
877 | | -
|
878 | | - .. versionadded:: 0.33.0 |
879 | | - """ |
880 | | - job_db = get_job_db(path) |
881 | | - if isinstance(job_db, FullDataFrameJobDatabase): |
882 | | - job_db.initialize_from_df(df=df, on_exists=on_exists) |
883 | | - else: |
884 | | - raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.") |
885 | | - return job_db |
886 | | - |
887 | | - |
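The two factories deleted here are the same `get_job_db` and `create_job_db` now imported from `_job_database` at the top of the module, so caller-facing behavior should be unchanged. Their extension-based dispatch, per the removed docstrings and code (paths are illustrative):

```python
from openeo.extra.job_management import get_job_db

get_job_db("jobs.csv")         # -> CsvJobDatabase
get_job_db("jobs.parquet")     # -> ParquetJobDatabase
get_job_db("jobs.geoparquet")  # -> ParquetJobDatabase
get_job_db("jobs.json")        # raises ValueError: could not guess job database type
```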