|
12 | 12 | import shapely.errors |
13 | 13 | import shapely.wkt |
14 | 14 |
|
15 | | -from openeo.extra.job_management._dataframe_utils import normalize_dataframe, COLUMN_REQUIREMENTS |
16 | | - |
17 | 15 |
|
18 | 16 |
|
19 | 17 | _log = logging.getLogger(__name__) |
20 | 18 |
|
| 19 | +import pandas as pd |
| 20 | + |
| 21 | +class _ColumnProperties: |
| 22 | + def __init__(self, dtype: str, default=None): |
| 23 | + self.dtype = dtype |
| 24 | + self.default = default |
| 25 | + |
| 26 | + |
| 27 | + |
21 | 28 | class JobDatabaseInterface(metaclass=abc.ABCMeta): |
22 | 29 | """ |
23 | 30 | Interface for a database of job metadata to use with the :py:class:`MultiBackendJobManager`, |
@@ -78,7 +85,20 @@ def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame: |
78 | 85 | """ |
79 | 86 | ... |
80 | 87 |
|
81 | | - |
| 88 | +# Expected columns in the job DB dataframes. |
| 89 | +# TODO: make this part of public API when settled? |
| 90 | +# TODO: move non official statuses to seperate column (not_started, queued_for_start) |
| 91 | +COLUMN_REQUIREMENTS = { |
| 92 | + "id": _ColumnProperties(dtype="str"), |
| 93 | + "backend_name": _ColumnProperties(dtype="str"), |
| 94 | + "status": _ColumnProperties(dtype="str", default="not_started"), |
| 95 | + "start_time": _ColumnProperties(dtype="str"), |
| 96 | + "running_start_time": _ColumnProperties(dtype="str"), |
| 97 | + "cpu": _ColumnProperties(dtype="str"), |
| 98 | + "memory": _ColumnProperties(dtype="str"), |
| 99 | + "duration": _ColumnProperties(dtype="str"), |
| 100 | + "costs": _ColumnProperties(dtype="float64"), |
| 101 | +} |
82 | 102 |
|
83 | 103 | class FullDataFrameJobDatabase(JobDatabaseInterface): |
84 | 104 | def __init__(self): |
@@ -271,6 +291,18 @@ def persist(self, df: pd.DataFrame): |
271 | 291 | self.path.parent.mkdir(parents=True, exist_ok=True) |
272 | 292 | self.df.to_parquet(self.path, index=False) |
273 | 293 |
|
| 294 | +def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: |
| 295 | + """ |
| 296 | + Normalize given pandas dataframe (creating a new one): |
| 297 | + ensure we have the required columns. |
| 298 | +
|
| 299 | + :param df: The dataframe to normalize. |
| 300 | + :return: a new dataframe that is normalized. |
| 301 | + """ |
| 302 | + new_columns = {col: req.default for (col, req) in COLUMN_REQUIREMENTS.items() if col not in df.columns} |
| 303 | + df = df.assign(**new_columns) |
| 304 | + return df |
| 305 | + |
274 | 306 | def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"): |
275 | 307 | """ |
276 | 308 | Factory to create a job database at given path, |
|
0 commit comments