Commit 441387c

influence the job_database_directly
1 parent c1edf0d commit 441387c

2 files changed: +116 -9 lines changed


openeo/extra/job_management.py

Lines changed: 37 additions & 9 deletions
@@ -665,28 +665,56 @@ def on_job_cancel(self, job: BatchJob, row):
         """
         pass
 
-    def _cancel_prolonged_job(self, job: BatchJob, row):
+    def _cancel_prolonged_job(self, job: BatchJob, row, df):
         """Cancel the job if it has been running for too long."""
         try:
-            running_start_time_str = row.get("running_start_time")
-            if not running_start_time_str or pd.isna(running_start_time_str):
-                _log.warning(f"Job {job.job_id} does not have a valid running start time. Cancellation skipped.")
-                return
+            # Ensure running start time is valid
+            running_start_time = self._ensure_running_start_time(job, row, df)
 
-            job_running_start_time = rfc3339.parse_datetime(running_start_time_str, with_timezone=True)
-            elapsed = datetime.datetime.now(tz=datetime.timezone.utc) - job_running_start_time
+            # Get the current time in RFC 3339 format (timezone-aware)
+            current_time_rfc3339 = rfc3339.utcnow()
+
+            # Parse the current time into a datetime object with timezone info
+            current_time = rfc3339.parse_datetime(current_time_rfc3339, with_timezone=True)
+
+            # Calculate the elapsed time between job start and now
+            elapsed = current_time - running_start_time
 
             if elapsed > self._cancel_running_job_after:
                 try:
                     _log.info(
-                        f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})"
+                        f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {running_start_time})"
                     )
                     job.stop()
                 except OpenEoApiError as e:
                     _log.error(f"Failed to cancel long-running job {job.job_id}: {e}")
         except Exception as e:
             _log.error(f"Unexpected error while handling job {job.job_id}: {e}")
 
+    def _ensure_running_start_time(self, job: BatchJob, row, df) -> datetime.datetime:
+        """
+        Ensures the running start time is valid. If missing, approximates with the current time.
+        Returns the parsed running start time as a datetime object.
+        """
+        running_start_time_str = row.get("running_start_time")
+
+        if not running_start_time_str or pd.isna(running_start_time_str):
+            _log.warning(
+                f"Job {job.job_id} does not have a valid running start time. Setting the current time as an approximation."
+            )
+            # Generate the current time in RFC 3339 format
+            current_time_rfc3339 = rfc3339.utcnow()
+
+            # Update the DataFrame safely using `.loc`
+            df.loc[df.index[row.name], "running_start_time"] = current_time_rfc3339
+
+            # Parse and return the datetime object with UTC timezone
+            return rfc3339.parse_datetime(current_time_rfc3339, with_timezone=True)
+
+        # Parse the existing time string and return it
+        return rfc3339.parse_datetime(running_start_time_str, with_timezone=True)
+
+
     def get_job_dir(self, job_id: str) -> Path:
         """Path to directory where job metadata, results and error logs are be saved."""
         return self._root_dir / f"job_{job_id}"
@@ -746,7 +774,7 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] =
                     self.on_job_cancel(the_job, active.loc[i])
 
                 if self._cancel_running_job_after and new_status == "running":
-                    self._cancel_prolonged_job(the_job, active.loc[i])
+                    self._cancel_prolonged_job(the_job, active.loc[i], active)
 
                 active.loc[i, "status"] = new_status
 
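The net effect of the new _ensure_running_start_time fallback can be illustrated in isolation. Below is a minimal sketch (not part of the commit) that reuses the same openeo.util.rfc3339 helpers the diff relies on; the one-row DataFrame and the variable names are made up for illustration:

import pandas as pd
from openeo.util import rfc3339

# A job-database row whose running_start_time was never filled in.
df = pd.DataFrame({"running_start_time": [None]})
row = df.loc[0]

value = row.get("running_start_time")
if not value or pd.isna(value):
    # Fall back to "now" (as an RFC 3339 string) and write it back into the
    # dataframe, mirroring the df.loc[...] update done by the new helper.
    value = rfc3339.utcnow()
    df.loc[df.index[row.name], "running_start_time"] = value

# A timezone-aware datetime comes back either way, so the elapsed-time check
# in _cancel_prolonged_job no longer has to skip rows with a missing value.
running_start_time = rfc3339.parse_datetime(value, with_timezone=True)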

tests/extra/test_job_management.py

Lines changed: 79 additions & 0 deletions
@@ -7,6 +7,7 @@
 from time import sleep
 from typing import Callable, Union
 from unittest import mock
+import datetime
 
 import dirty_equals
 import geopandas
@@ -554,6 +555,7 @@ def start_job(row, connection_provider, connection, **kwargs):
                 12 * 60 * 60,
                 "finished",
             ),
+
         ],
     )
     def test_automatic_cancel_of_too_long_running_jobs(
@@ -645,6 +647,83 @@ def test_status_logging(self, tmp_path, job_manager, job_manager_root_dir, sleep
         assert needle.search(caplog.text)
 
 
+    @pytest.mark.parametrize(
+        ["create_time", "start_time", "running_start_time", "end_time", "end_status", "cancel_after_seconds", "expected_status"],
+        [
+            # Scenario 1: Missing running_start_time (None)
+            (
+                "2024-09-01T09:00:00Z",  # Job creation time
+                "2024-09-01T09:00:00Z",  # Job start time (should be 1 hour after create_time)
+                None,  # Missing running_start_time
+                "2024-09-01T20:00:00Z",  # Job end time
+                "finished",  # Job final status
+                6 * 60 * 60,  # Cancel after 6 hours
+                "finished",  # Expected final status
+            ),
+            # Scenario 2: NaN running_start_time
+            (
+                "2024-09-01T09:00:00Z",
+                "2024-09-01T09:00:00Z",
+                float("nan"),  # NaN running_start_time
+                "2024-09-01T20:00:00Z",  # Job end time
+                "finished",  # Job final status
+                6 * 60 * 60,  # Cancel after 6 hours
+                "finished",  # Expected final status
+            ),
+        ]
+    )
+    def test_ensure_running_start_time_is_datetime(
+        self,
+        tmp_path,
+        time_machine,
+        create_time,
+        start_time,
+        running_start_time,
+        end_time,
+        end_status,
+        cancel_after_seconds,
+        expected_status,
+        dummy_backend_foo,
+        job_manager_root_dir,
+    ):
+        def get_status(job_id, current_status):
+            if rfc3339.utcnow() < start_time:
+                return "queued"
+            elif rfc3339.utcnow() < end_time:
+                return "running"
+            return end_status
+
+        # Set the job status updater function for the mock backend
+        dummy_backend_foo.job_status_updater = get_status
+
+        job_manager = MultiBackendJobManager(
+            root_dir=job_manager_root_dir, cancel_running_job_after=cancel_after_seconds
+        )
+        job_manager.add_backend("foo", connection=dummy_backend_foo.connection)
+
+        # Create a DataFrame representing the job database
+        df = pd.DataFrame({
+            "year": [2024],
+            "running_start_time": [running_start_time],  # Initial running_start_time
+        })
+
+        # Move the time machine to the job creation time
+        time_machine.move_to(create_time)
+
+        job_db_path = tmp_path / "jobs.csv"
+
+        # Mock sleep() to skip one hour at a time instead of actually sleeping
+        with mock.patch.object(openeo.extra.job_management.time, "sleep", new=lambda s: time_machine.shift(60 * 60)):
+            job_manager.run_jobs(df=df, start_job=self._create_year_job, job_db=job_db_path)
+
+        final_df = CsvJobDatabase(job_db_path).read()
+
+        # Validate running_start_time is a valid datetime object
+        filled_running_start_time = final_df.iloc[0]["running_start_time"]
+        assert isinstance(rfc3339.parse_datetime(filled_running_start_time), datetime.datetime)
+
+
+
 JOB_DB_DF_BASICS = pd.DataFrame(
     {
         "numbers": [3, 2, 1],
