1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `MultiBackendJobManager`: Fix encoding issue of job metadata in `on_job_done` ([#657](https://github.com/Open-EO/openeo-python-client/issues/657))
- `MultiBackendJobManager`: Avoid `SettingWithCopyWarning` ([#641](https://github.com/Open-EO/openeo-python-client/issues/641))
- Avoid creating empty file if asset download request failed.
- `MultiBackendJobManager`: avoid dtype loading mistakes in `CsvJobDatabase` on empty columns ([#656](https://github.com/Open-EO/openeo-python-client/issues/656))


## [0.34.0] - 2024-10-31
58 changes: 38 additions & 20 deletions openeo/extra/job_management.py
@@ -1,6 +1,7 @@
import abc
import collections
import contextlib
import dataclasses
import datetime
import json
import logging
@@ -9,7 +10,7 @@
import warnings
from pathlib import Path
from threading import Thread
from typing import Callable, Dict, List, NamedTuple, Optional, Union
from typing import Any, Callable, Dict, List, Mapping, NamedTuple, Optional, Union

import numpy
import pandas as pd
@@ -104,6 +105,14 @@ def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs):
raise NotImplementedError("No 'start_job' callable provided")


@dataclasses.dataclass(frozen=True)
class _ColumnProperties:
"""Expected/required properties of a column in the job manager related dataframes"""

dtype: str = "object"
default: Any = None


class MultiBackendJobManager:
"""
Tracker for multiple jobs on multiple backends.
@@ -171,6 +180,23 @@ def start_job(
Added ``cancel_running_job_after`` parameter.
"""

# Expected columns in the job DB dataframes.
# TODO: make this part of public API when settled?
_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = {
"id": _ColumnProperties(dtype="str"),
"backend_name": _ColumnProperties(dtype="str"),
"status": _ColumnProperties(dtype="str", default="not_started"),
# TODO: use proper date/time dtype instead of legacy str for start times?
"start_time": _ColumnProperties(dtype="str"),
"running_start_time": _ColumnProperties(dtype="str"),
# TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
# but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
# Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
"cpu": _ColumnProperties(dtype="str"),
"memory": _ColumnProperties(dtype="str"),
"duration": _ColumnProperties(dtype="str"),
}

def __init__(
self,
poll_sleep: int = 60,
@@ -267,31 +293,16 @@ def _make_resilient(connection):
connection.session.mount("https://", HTTPAdapter(max_retries=retries))
connection.session.mount("http://", HTTPAdapter(max_retries=retries))

@staticmethod
def _normalize_df(df: pd.DataFrame) -> pd.DataFrame:
@classmethod
def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame:
"""
Normalize given pandas dataframe (creating a new one):
ensure we have the required columns.

:param df: The dataframe to normalize.
:return: a new dataframe that is normalized.
"""
# check for some required columns.
required_with_default = [
("status", "not_started"),
("id", None),
("start_time", None),
("running_start_time", None),
# TODO: columns "cpu", "memory", "duration" are not referenced directly
# within MultiBackendJobManager making it confusing to claim they are required.
# However, they are through assumptions about job "usage" metadata in `_track_statuses`.
# => proposed solution: allow to configure usage columns when adding a backend
("cpu", None),
("memory", None),
("duration", None),
("backend_name", None),
]
new_columns = {col: val for (col, val) in required_with_default if col not in df.columns}
new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns}
df = df.assign(**new_columns)

return df
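
For context, here is a minimal standalone sketch (not part of the diff) of how such a column-requirements mapping drives the normalization above; `column_requirements` and `normalize` are illustrative stand-ins for the private `_COLUMN_REQUIREMENTS` and `_normalize_df`:

```python
import pandas as pd

# Illustrative stand-in for MultiBackendJobManager._COLUMN_REQUIREMENTS:
# column name -> (dtype to use when reading CSV, default value when the column is missing).
column_requirements = {
    "id": ("str", None),
    "status": ("str", "not_started"),
    "backend_name": ("str", None),
}


def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Add missing required columns with their default value (mirrors `_normalize_df`)."""
    new_columns = {col: default for col, (_, default) in column_requirements.items() if col not in df.columns}
    return df.assign(**new_columns)


df = pd.DataFrame({"id": ["job-000"]})
print(list(normalize(df).columns))  # ['id', 'status', 'backend_name']
```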
@@ -486,6 +497,9 @@ def _job_update_loop(
go through the necessary jobs to check for status updates,
trigger status events, start new jobs when there is room for them, etc.
"""
if not self.backends:
raise RuntimeError("No backends registered")

stats = stats if stats is not None else collections.defaultdict(int)

with ignore_connection_errors(context="get statuses"):
@@ -832,7 +846,11 @@ def _is_valid_wkt(self, wkt: str) -> bool:
return False

def read(self) -> pd.DataFrame:
df = pd.read_csv(self.path)
df = pd.read_csv(
self.path,
# TODO: possible to avoid hidden coupling with MultiBackendJobManager here?
dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()},
)
if (
"geometry" in df.columns
and df["geometry"].dtype.name != "geometry"
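
As a side note (not part of the diff), the pandas behaviour this `dtype` argument guards against can be reproduced in isolation: a column that is entirely empty round-trips through CSV as `float64` unless its dtype is forced:

```python
import io

import pandas as pd

# Job database where no job has been started yet: the "id" column is entirely empty.
df = pd.DataFrame({"id": [None, None], "status": ["not_started", "not_started"]})

buf = io.StringIO()
df.to_csv(buf, index=False)

# Without an explicit dtype, the empty "id" column is loaded back as float64 (all NaN),
# which later breaks string operations on job ids.
buf.seek(0)
print(pd.read_csv(buf)["id"].dtype)  # float64

# Forcing the dtype (as read() now does via _COLUMN_REQUIREMENTS) keeps it a str/object column.
buf.seek(0)
print(pd.read_csv(buf, dtype={"id": "str"})["id"].dtype)  # object
```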
84 changes: 73 additions & 11 deletions openeo/rest/_testing.py
@@ -3,7 +3,17 @@
import collections
import json
import re
from typing import Callable, Iterable, Iterator, Optional, Sequence, Tuple, Union
from typing import (
Callable,
Dict,
Iterable,
Iterator,
Mapping,
Optional,
Sequence,
Tuple,
Union,
)

from openeo import Connection, DataCube
from openeo.rest.vectorcube import VectorCube
@@ -32,7 +42,9 @@ class DummyBackend:
"validation_requests",
"next_result",
"next_validation_errors",
"_forced_job_status",
"job_status_updater",
"job_id_generator",
"extra_job_metadata_fields",
)

@@ -53,13 +65,20 @@ def __init__(
self.next_result = self.DEFAULT_RESULT
self.next_validation_errors = []
self.extra_job_metadata_fields = []
self._forced_job_status: Dict[str, str] = {}

# Job status update hook:
# callable that is called on starting a job, and getting job metadata
# allows to dynamically change how the status of a job evolves
# By default: immediately set to "finished" once job is started
self.job_status_updater = lambda job_id, current_status: "finished"

# Optional job id generator hook:
# callable that generates a job id, e.g. based on the process graph.
# When set to None, or the callable returns None, or it returns an existing job id:
# things fall back to auto-increment job ids ("job-000", "job-001", "job-002", ...)
self.job_id_generator: Optional[Callable[[dict], str]] = None

requests_mock.post(
connection.build_url("/result"),
content=self._handle_post_result,
@@ -75,10 +94,18 @@ def __init__(
requests_mock.get(
re.compile(connection.build_url(r"/jobs/(job-\d+)/results$")), json=self._handle_get_job_results
)
requests_mock.delete(
re.compile(connection.build_url(r"/jobs/(job-\d+)/results$")), json=self._handle_delete_job_results
)
requests_mock.get(
re.compile(connection.build_url("/jobs/(.*?)/results/result.data$")),
content=self._handle_get_job_result_asset,
)
requests_mock.get(
re.compile(connection.build_url(r"/jobs/(.*?)/logs($|\?.*)")),
# TODO: need to fine-tune dummy logs?
json={"logs": [], "links": []},
)
requests_mock.post(connection.build_url("/validation"), json=self._handle_post_validation)

@classmethod
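
A hypothetical usage sketch of the new `job_id_generator` hook in a test (assuming a `dummy_backend` fixture providing a `DummyBackend` instance): deriving the id from the process graph makes assertions independent of creation order, while a `None` or duplicate return value still falls back to the auto-increment ids.

```python
import hashlib
import json


def job_id_from_pg(process_graph: dict) -> str:
    # Deterministic job id derived from the process graph content.
    digest = hashlib.md5(json.dumps(process_graph, sort_keys=True).encode("utf8")).hexdigest()[:6]
    return f"job-{digest}"


dummy_backend.job_id_generator = job_id_from_pg
```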
@@ -88,7 +115,7 @@ def at_url(cls, root_url: str, *, requests_mock, capabilities: Optional[dict] =
including creation of connection and mocking of capabilities doc
"""
root_url = root_url.rstrip("/") + "/"
requests_mock.get(root_url, json=build_capabilities(**(capabilities or None)))
requests_mock.get(root_url, json=build_capabilities(**(capabilities or {})))
connection = Connection(root_url)
return cls(requests_mock=requests_mock, connection=connection)

@@ -150,7 +177,14 @@ def _handle_post_jobs(self, request, context):
"""handler of `POST /jobs` (create batch job)"""
post_data = request.json()
pg = post_data["process"]["process_graph"]
job_id = f"job-{len(self.batch_jobs):03d}"

# Generate (new) job id
job_id = self.job_id_generator and self.job_id_generator(process_graph=pg)
if not job_id or job_id in self.batch_jobs:
# As fallback: use auto-increment job ids ("job-000", "job-001", "job-002", ...)
job_id = f"job-{len(self.batch_jobs):03d}"
assert job_id not in self.batch_jobs

job_data = {"job_id": job_id, "pg": pg, "status": "created"}
for field in ["title", "description"]:
if field in post_data:
@@ -169,11 +203,16 @@ def _get_job_id(self, request) -> str:
assert job_id in self.batch_jobs
return job_id

def _get_job_status(self, job_id: str, current_status: str) -> str:
if job_id in self._forced_job_status:
return self._forced_job_status[job_id]
return self.job_status_updater(job_id=job_id, current_status=current_status)

def _handle_post_job_results(self, request, context):
"""Handler of `POST /job/{job_id}/results` (start batch job)."""
job_id = self._get_job_id(request)
assert self.batch_jobs[job_id]["status"] == "created"
self.batch_jobs[job_id]["status"] = self.job_status_updater(
self.batch_jobs[job_id]["status"] = self._get_job_status(
job_id=job_id, current_status=self.batch_jobs[job_id]["status"]
)
context.status_code = 202
@@ -183,10 +222,14 @@ def _handle_get_job(self, request, context):
job_id = self._get_job_id(request)
# Allow updating status with `job_status_setter` once job got past status "created"
if self.batch_jobs[job_id]["status"] != "created":
self.batch_jobs[job_id]["status"] = self.job_status_updater(
self.batch_jobs[job_id]["status"] = self._get_job_status(
job_id=job_id, current_status=self.batch_jobs[job_id]["status"]
)
return {"id": job_id, "status": self.batch_jobs[job_id]["status"]}
return {
# TODO: add some more required fields like "process" and "created"?
"id": job_id,
"status": self.batch_jobs[job_id]["status"],
}

def _handle_get_job_results(self, request, context):
"""Handler of `GET /job/{job_id}/results` (list batch job results)."""
@@ -197,6 +240,13 @@ def _handle_get_job_results(self, request, context):
"assets": {"result.data": {"href": self.connection.build_url(f"/jobs/{job_id}/results/result.data")}},
}

def _handle_delete_job_results(self, request, context):
"""Handler of `DELETE /job/{job_id}/results` (cancel job)."""
job_id = self._get_job_id(request)
self.batch_jobs[job_id]["status"] = "canceled"
self._forced_job_status[job_id] = "canceled"
context.status_code = 204

def _handle_get_job_result_asset(self, request, context):
"""Handler of `GET /job/{job_id}/results/result.data` (get batch job result asset)."""
job_id = self._get_job_id(request)
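
A hedged sketch (not from the test suite, assuming the same `dummy_backend` fixture) of how the new cancel handler would come into play: stopping a started job hits `DELETE /jobs/{job_id}/results`, which pins the dummy job's status to "canceled".

```python
# Start a batch job against the dummy backend, then cancel it.
con = dummy_backend.connection
job = con.create_job({"add1": {"process_id": "add", "arguments": {"x": 1, "y": 2}, "result": True}})
job.start_job()
job.stop_job()  # DELETE /jobs/{job_id}/results -> handler above forces status "canceled"
assert job.status() == "canceled"
```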
@@ -261,18 +311,30 @@ def execute(self, cube: Union[DataCube, VectorCube], process_id: Optional[str] =
cube.execute()
return self.get_pg(process_id=process_id)

def setup_simple_job_status_flow(self, *, queued: int = 1, running: int = 4, final: str = "finished"):
def setup_simple_job_status_flow(
self,
*,
queued: int = 1,
running: int = 4,
final: str = "finished",
final_per_job: Optional[Mapping[str, str]] = None,
):
"""
Set up simple job status flow:
queued (a couple of times) -> running (a couple of times) -> finished/error.

queued (a couple of times) -> running (a couple of times) -> finished/error.

Final state can be specified generically with arg `final`
and, optionally, further fine-tuned per job with `final_per_job`.
"""
template = ["queued"] * queued + ["running"] * running + [final]
template = ["queued"] * queued + ["running"] * running
job_stacks = collections.defaultdict(template.copy)
final_per_job = final_per_job or {}

def get_status(job_id: str, current_status: str) -> str:
stack = job_stacks[job_id]
# Pop first item each time, but repeat the last one at the end
return stack.pop(0) if len(stack) > 1 else stack[0]
# Pop first item each time, unless we're in final state
return stack.pop(0) if len(stack) > 0 else final_per_job.get(job_id, final)

self.job_status_updater = get_status

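
For illustration, a possible test setup using the new `final_per_job` parameter (again assuming a `dummy_backend` fixture): every job goes through the queued/running phases, but "job-001" ends in "error" instead of the generic "finished".

```python
dummy_backend.setup_simple_job_status_flow(
    queued=2,
    running=3,
    final="finished",
    final_per_job={"job-001": "error"},
)
```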