Skip to content

Commit 434854f

Browse files
committed
Introduce JobDatabaseInterface.get_by_indices
- make sure `persist` gets complete rows - implement in both `FullDataFrameJobDatabase` and `STACAPIJobDatabase` refs: #719, #736, #793
1 parent ed28aaa commit 434854f

File tree

3 files changed

+99
-24
lines changed

3 files changed

+99
-24
lines changed

openeo/extra/job_management/__init__.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ def exists(self) -> bool:
8181
@abc.abstractmethod
8282
def persist(self, df: pd.DataFrame):
8383
"""
84-
Store job data to the database.
85-
The provided dataframe may contain partial information, which is merged into the larger database.
84+
Store (new or updated) job data to the database.
85+
86+
The provided dataframe may only cover a subset of all the jobs ("rows") of the whole database,
87+
so it should be merged with the existing data (if any) instead of overwriting it completely.
8688
8789
:param df: job data to store.
8890
"""
@@ -111,6 +113,17 @@ def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
111113
"""
112114
...
113115

116+
@abc.abstractmethod
117+
def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
118+
"""
119+
Returns a dataframe with jobs based on their (dataframe) index
120+
121+
:param indices: List of indices to include.
122+
123+
:return: DataFrame with jobs filtered by indices.
124+
"""
125+
...
126+
114127

115128
def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs):
116129
raise NotImplementedError("No 'start_job' callable provided")
@@ -707,9 +720,9 @@ def _process_threadworker_updates(
707720
if not updates:
708721
return
709722

710-
# Build DataFrame of updates indexed by df_idx
711-
df_updates = pd.DataFrame(updates).set_index("df_idx", drop=True)
712-
723+
# Build update DataFrame and persist
724+
df_updates = job_db.get_by_indices(indices=set(u["df_idx"] for u in updates))
725+
df_updates.update(pd.DataFrame(updates).set_index("df_idx", drop=True), overwrite=True)
713726
job_db.persist(df_updates)
714727
stats["job_db persist"] = stats.get("job_db persist", 0) + 1
715728

@@ -968,10 +981,21 @@ def get_by_status(self, statuses, max=None) -> pd.DataFrame:
968981

969982
def _merge_into_df(self, df: pd.DataFrame):
970983
if self._df is not None:
984+
unknown_indices = set(df.index).difference(self._df.index)
985+
if unknown_indices:
986+
_log.warning(f"Merging DataFrame with {unknown_indices=} which will be lost.")
971987
self._df.update(df, overwrite=True)
972988
else:
973989
self._df = df
974990

991+
def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
992+
indices = set(indices)
993+
known = indices.intersection(self.df.index)
994+
unknown = indices.difference(self.df.index)
995+
if unknown:
996+
_log.warning(f"Ignoring unknown DataFrame indices {unknown}")
997+
return self._df.loc[list(known)]
998+
975999

9761000
class CsvJobDatabase(FullDataFrameJobDatabase):
9771001
"""

openeo/extra/job_management/stac_job_db.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import concurrent.futures
22
import datetime
33
import logging
4-
from typing import Iterable, List, Optional
4+
from typing import Iterable, List, Optional, Union
55

66
import geopandas as gpd
77
import numpy as np
@@ -165,6 +165,12 @@ def count_by_status(self, statuses: Iterable[str] = ()) -> dict:
165165
else:
166166
return items["status"].value_counts().to_dict()
167167

168+
def _search_result_to_df(self, search_result: pystac_client.ItemSearch) -> pd.DataFrame:
169+
"""Build a DataFrame from a STAC ItemSearch result."""
170+
series = [self.series_from(item) for item in search_result.items()]
171+
df = pd.DataFrame(series).reset_index(names=["item_id"])
172+
return df
173+
168174
def get_by_status(self, statuses: Iterable[str], max: Optional[int] = None) -> pd.DataFrame:
169175
if isinstance(statuses, str):
170176
statuses = {statuses}
@@ -178,16 +184,24 @@ def get_by_status(self, statuses: Iterable[str], max: Optional[int] = None) -> p
178184
max_items=max,
179185
)
180186

181-
series = [self.series_from(item) for item in search_results.items()]
187+
df = self._search_result_to_df(search_results)
182188

183-
df = pd.DataFrame(series).reset_index(names=["item_id"])
184-
if len(series) == 0:
189+
if df.shape[0] == 0:
185190
# TODO: What if default columns are overwritten by the user?
186191
df = self._normalize_df(
187192
df
188193
) # Even for an empty dataframe the default columns are required
189194
return df
190195

196+
def get_by_indices(self, indices: Iterable[Union[int, str]]) -> pd.DataFrame:
197+
search_results = self.client.search(
198+
method="GET",
199+
collections=[self.collection_id],
200+
ids=[str(i) for i in indices],
201+
)
202+
df = self._search_result_to_df(search_results)
203+
return df
204+
191205
def persist(self, df: pd.DataFrame):
192206
if not self.exists():
193207
spatial_extent = pystac.SpatialExtent([[-180, -90, 180, 90]])

tests/extra/job_management/test_job_management.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import numpy as np
2525
import pandas
2626
import pandas as pd
27+
import pandas.testing
2728
import pytest
2829
import requests
2930
import shapely.geometry
@@ -745,7 +746,6 @@ def get_status(job_id, current_status):
745746
filled_running_start_time = final_df.iloc[0]["running_start_time"]
746747
assert isinstance(rfc3339.parse_datetime(filled_running_start_time), datetime.datetime)
747748

748-
749749
def test_process_threadworker_updates(self, tmp_path, caplog):
750750
pool = _JobManagerWorkerThreadPool(max_workers=2)
751751
stats = collections.defaultdict(int)
@@ -755,8 +755,6 @@ def test_process_threadworker_updates(self, tmp_path, caplog):
755755
pool.submit_task(DummyTask("j-1", df_idx=1, db_update={"status": "queued"}, stats_update=None))
756756
pool.submit_task(DummyTask("j-2", df_idx=2, db_update=None, stats_update={"queued": 1}))
757757
pool.submit_task(DummyTask("j-3", df_idx=3, db_update=None, stats_update=None))
758-
# Invalid index (not in DB)
759-
pool.submit_task(DummyTask("j-missing", df_idx=4, db_update={"status": "created"}, stats_update=None))
760758

761759
df_initial = pd.DataFrame(
762760
{
@@ -768,23 +766,62 @@ def test_process_threadworker_updates(self, tmp_path, caplog):
768766

769767
mgr = MultiBackendJobManager(root_dir=tmp_path / "jobs")
770768

771-
with caplog.at_level(logging.ERROR):
772-
mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats)
769+
mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats)
773770

774771
df_final = job_db.read()
772+
pandas.testing.assert_frame_equal(
773+
df_final[["id", "status"]],
774+
pandas.DataFrame(
775+
{
776+
"id": ["j-0", "j-1", "j-2", "j-3"],
777+
"status": ["queued", "queued", "created", "created"],
778+
}
779+
),
780+
)
781+
assert stats == dirty_equals.IsPartialDict(
782+
{
783+
"queued": 2,
784+
"job_db persist": 1,
785+
}
786+
)
787+
assert caplog.messages == []
788+
789+
def test_process_threadworker_updates_unknown(self, tmp_path, caplog):
790+
pool = _JobManagerWorkerThreadPool(max_workers=2)
791+
stats = collections.defaultdict(int)
792+
793+
pool.submit_task(DummyTask("j-123", df_idx=0, db_update={"status": "queued"}, stats_update={"queued": 1}))
794+
pool.submit_task(DummyTask("j-unknown", df_idx=4, db_update={"status": "created"}, stats_update=None))
795+
796+
df_initial = pd.DataFrame(
797+
{
798+
"id": ["j-123", "j-456"],
799+
"status": ["created", "created"],
800+
}
801+
)
802+
job_db = CsvJobDatabase(tmp_path / "jobs.csv").initialize_from_df(df_initial)
775803

776-
# Assert no rows were appended
777-
assert len(df_final) == 4
804+
mgr = MultiBackendJobManager(root_dir=tmp_path / "jobs")
778805

779-
# Assert updates
780-
assert df_final.loc[0, "status"] == "queued"
781-
assert df_final.loc[1, "status"] == "queued"
782-
assert df_final.loc[2, "status"] == "created"
783-
assert df_final.loc[3, "status"] == "created"
806+
mgr._process_threadworker_updates(worker_pool=pool, job_db=job_db, stats=stats)
784807

785-
# Assert stats
786-
assert stats.get("queued", 0) == 2
787-
assert stats["job_db persist"] == 1
808+
df_final = job_db.read()
809+
pandas.testing.assert_frame_equal(
810+
df_final[["id", "status"]],
811+
pandas.DataFrame(
812+
{
813+
"id": ["j-123", "j-456"],
814+
"status": ["queued", "created"],
815+
}
816+
),
817+
)
818+
assert stats == dirty_equals.IsPartialDict(
819+
{
820+
"queued": 1,
821+
"job_db persist": 1,
822+
}
823+
)
824+
assert caplog.messages == [dirty_equals.IsStr(regex=".*Ignoring unknown.*indices.*4.*")]
788825

789826
def test_no_results_leaves_db_and_stats_untouched(self, tmp_path, caplog):
790827
pool = _JobManagerWorkerThreadPool(max_workers=2)

0 commit comments

Comments
 (0)