-
Notifications
You must be signed in to change notification settings - Fork 11
[integration] Enable async and distributed processing for the ML backend #910
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
8aad275
61b45a4
4a03c7e
09d7dfb
996674e
ce973fc
bf7178d
41efa42
78babeb
8d28d01
bb22514
1dbc5f0
fe1a9f4
7747f3a
1978cbe
82ac82d
7d733f9
07d61d9
d129029
76ce2d8
035b952
1230386
45dbacf
7361fb2
3f722c8
075a7ec
85c676d
cbd7ae0
7d15ffb
c2881b4
14396ba
6613366
2cf0c0a
1dbf3b1
c82c076
fb874c4
f2ef5ff
b6ce90f
8fe8b1d
d0f4f26
fc8470d
3d3b820
5c7af56
e7e579e
cb74eac
ffea1aa
6cb852b
d0380b9
2594049
57e6691
f785dda
0a22d53
d139734
a83dd20
0707433
0103b7e
3a3b881
7c86612
c016d47
fa510ed
6da55a9
652f47f
f3b588a
5654ed0
875d3cb
be351f8
e550531
2fa57ef
5c21be6
a1e8fa3
57fba22
f8e374a
cd593bc
bde423a
a1238dc
03390e2
460f27c
6f87ca4
bd86042
b552d1e
41b8c16
caa11db
0fd6369
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| # Generated by Django 4.2.10 on 2025-11-04 11:44 | ||
|
|
||
| import datetime | ||
| from django.db import migrations, models | ||
|
|
||
|
|
||
| class Migration(migrations.Migration): | ||
| dependencies = [ | ||
| ("jobs", "0022_job_last_checked"), | ||
| ] | ||
|
|
||
| operations = [ | ||
| migrations.AlterField( | ||
| model_name="job", | ||
| name="last_checked", | ||
| field=models.DateTimeField(blank=True, default=datetime.datetime.now, null=True), | ||
| ), | ||
| migrations.AlterField( | ||
| model_name="mltaskrecord", | ||
| name="status", | ||
| field=models.CharField( | ||
| choices=[ | ||
| ("PENDING", "PENDING"), | ||
| ("STARTED", "STARTED"), | ||
| ("SUCCESS", "SUCCESS"), | ||
| ("FAIL", "FAIL"), | ||
| ("REVOKED", "REVOKED"), | ||
| ], | ||
| default="STARTED", | ||
| max_length=255, | ||
| ), | ||
| ), | ||
| ] | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -396,9 +396,9 @@ def check_inprogress_subtasks(cls, job: "Job") -> bool: | |||||||||||||||||||||||||
| inprogress_subtask.task_id = save_results_task.id | ||||||||||||||||||||||||||
| task_id = save_results_task.id | ||||||||||||||||||||||||||
| inprogress_subtask.save() | ||||||||||||||||||||||||||
| job.logger.info(f"Started save results task {inprogress_subtask.task_id}") | ||||||||||||||||||||||||||
| job.logger.debug(f"Started save results task {inprogress_subtask.task_id}") | ||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||
| job.logger.info("A save results task is already in progress, will not start another one yet.") | ||||||||||||||||||||||||||
| job.logger.debug("A save results task is already in progress, will not start another one yet.") | ||||||||||||||||||||||||||
| continue | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| task = AsyncResult(task_id) | ||||||||||||||||||||||||||
|
|
@@ -407,12 +407,12 @@ def check_inprogress_subtasks(cls, job: "Job") -> bool: | |||||||||||||||||||||||||
| inprogress_subtask.status = ( | ||||||||||||||||||||||||||
| MLSubtaskState.SUCCESS.name if task.successful() else MLSubtaskState.FAIL.name | ||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||
| inprogress_subtask.raw_traceback = task.traceback | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| if task.traceback: | ||||||||||||||||||||||||||
| # TODO: Error logs will have many tracebacks | ||||||||||||||||||||||||||
| # could add some processing to provide a concise error summary | ||||||||||||||||||||||||||
| job.logger.error(f"Subtask {task_name} ({task_id}) failed: {task.traceback}") | ||||||||||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This logs the error, but then still tries to parse a successful result. Can you mark the task as failed and then continue to the next one? The status check did its job correctly! But the subtask failed. Can you update the MLTaskRecord to say it failed? |
||||||||||||||||||||||||||
| inprogress_subtask.status = MLSubtaskState.FAIL.name | ||||||||||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this get saved?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup, with Lines 441 to 452 in bd86042
|
||||||||||||||||||||||||||
| inprogress_subtask.raw_traceback = task.traceback | ||||||||||||||||||||||||||
| continue | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| results_dict = task.result | ||||||||||||||||||||||||||
| if task_name == MLSubtaskNames.process_pipeline_request.name: | ||||||||||||||||||||||||||
|
|
@@ -505,7 +505,7 @@ def check_inprogress_subtasks(cls, job: "Job") -> bool: | |||||||||||||||||||||||||
| f"{inprogress_subtasks.count()} inprogress subtasks remaining out of {total_subtasks} total subtasks." | ||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||
| inprogress_task_ids = [task.task_id for task in inprogress_subtasks] | ||||||||||||||||||||||||||
| job.logger.info(f"Subtask ids: {inprogress_task_ids}") # TODO: remove this? not very useful to the user | ||||||||||||||||||||||||||
| job.logger.debug(f"Subtask ids: {inprogress_task_ids}") | ||||||||||||||||||||||||||
| return False | ||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||
| job.logger.info("No inprogress subtasks left.") | ||||||||||||||||||||||||||
|
|
@@ -999,6 +999,7 @@ class MLSubtaskState(str, OrderedEnum): | |||||||||||||||||||||||||
| STARTED = "STARTED" | ||||||||||||||||||||||||||
| SUCCESS = "SUCCESS" | ||||||||||||||||||||||||||
| FAIL = "FAIL" | ||||||||||||||||||||||||||
| REVOKED = "REVOKED" | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| class MLTaskRecord(BaseModel): | ||||||||||||||||||||||||||
|
|
@@ -1041,6 +1042,17 @@ def clean(self): | |||||||||||||||||||||||||
| if self.status == MLSubtaskState.PENDING.name and self.task_name != MLSubtaskNames.save_results.name: | ||||||||||||||||||||||||||
| raise ValueError(f"{self.task_name} tasks cannot have a PENDING status.") | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| def kill_task(self): | ||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||
| Kill the celery task associated with this MLTaskRecord. | ||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||
| from config.celery_app import app as celery_app | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| if self.task_id: | ||||||||||||||||||||||||||
| celery_app.control.revoke(self.task_id, terminate=True, signal="SIGTERM") | ||||||||||||||||||||||||||
| self.status = MLSubtaskState.REVOKED.name | ||||||||||||||||||||||||||
| self.save(update_fields=["status"]) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| class Job(BaseModel): | ||||||||||||||||||||||||||
| """A job to be run by the scheduler""" | ||||||||||||||||||||||||||
|
|
@@ -1050,7 +1062,7 @@ class Job(BaseModel): | |||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| name = models.CharField(max_length=255) | ||||||||||||||||||||||||||
| queue = models.CharField(max_length=255, default="default") | ||||||||||||||||||||||||||
| last_checked = models.DateTimeField(null=True, blank=True) | ||||||||||||||||||||||||||
| last_checked = models.DateTimeField(null=True, blank=True, default=datetime.datetime.now) | ||||||||||||||||||||||||||
| scheduled_at = models.DateTimeField(null=True, blank=True) | ||||||||||||||||||||||||||
| started_at = models.DateTimeField(null=True, blank=True) | ||||||||||||||||||||||||||
| finished_at = models.DateTimeField(null=True, blank=True) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,4 +1,5 @@ | ||||||||||||||
| # from rich import print | ||||||||||||||
| import datetime | ||||||||||||||
| import logging | ||||||||||||||
| import time | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -406,4 +407,94 @@ def get_ml_job_subtask_details(task_name, job): | |||||||||||||
| self.assertEqual(job.status, JobState.SUCCESS.value) | ||||||||||||||
| self.assertEqual(job.progress.summary.progress, 1) | ||||||||||||||
| self.assertEqual(job.progress.summary.status, JobState.SUCCESS) | ||||||||||||||
| job.save() | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| class TestStaleMLJob(TransactionTestCase): | ||||||||||||||
| def setUp(self): | ||||||||||||||
| self.project = Project.objects.first() # get the original test project | ||||||||||||||
| assert self.project | ||||||||||||||
| self.source_image_collection = self.project.sourceimage_collections.get(name="Test Source Image Collection") | ||||||||||||||
| self.pipeline = Pipeline.objects.get(slug="constant") | ||||||||||||||
|
|
||||||||||||||
| # remove existing detections from the source image collection | ||||||||||||||
| for image in self.source_image_collection.images.all(): | ||||||||||||||
| image.detections.all().delete() | ||||||||||||||
| image.save() | ||||||||||||||
|
|
||||||||||||||
| def test_kill_dangling_ml_job(self): | ||||||||||||||
| """Test killing a dangling ML job.""" | ||||||||||||||
| from ami.ml.tasks import check_dangling_ml_jobs | ||||||||||||||
| from config import celery_app | ||||||||||||||
|
|
||||||||||||||
| job = Job.objects.create( | ||||||||||||||
| job_type_key=MLJob.key, | ||||||||||||||
| project=self.project, | ||||||||||||||
| name="Test dangling job", | ||||||||||||||
| delay=0, | ||||||||||||||
| pipeline=self.pipeline, | ||||||||||||||
| source_image_collection=self.source_image_collection, | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| job.run() | ||||||||||||||
| connection.commit() | ||||||||||||||
| job.refresh_from_db() | ||||||||||||||
|
|
||||||||||||||
| # Simulate last_checked being older than 5 minutes | ||||||||||||||
| job.last_checked = datetime.datetime.now() - datetime.timedelta(minutes=10) | ||||||||||||||
| job.save(update_fields=["last_checked"]) | ||||||||||||||
|
|
||||||||||||||
| # Run the dangling job checker | ||||||||||||||
| check_dangling_ml_jobs() | ||||||||||||||
|
|
||||||||||||||
| # Refresh job from DB | ||||||||||||||
| job.refresh_from_db() | ||||||||||||||
|
|
||||||||||||||
| # Make sure no tasks are still in progress | ||||||||||||||
| for ml_task_record in job.ml_task_records.all(): | ||||||||||||||
| self.assertEqual(ml_task_record.status, MLSubtaskState.REVOKED.value) | ||||||||||||||
|
|
||||||||||||||
| # Also check celery queue to make sure all tasks have been revoked | ||||||||||||||
| task_id = ml_task_record.task_id | ||||||||||||||
|
|
||||||||||||||
| inspector = celery_app.control.inspect() | ||||||||||||||
| active = inspector.active() or {} | ||||||||||||||
| reserved = inspector.reserved() or {} | ||||||||||||||
|
|
||||||||||||||
| not_running = all( | ||||||||||||||
| task_id not in [t["id"] for w in active.values() for t in w] for w in active.values() | ||||||||||||||
| ) and all(task_id not in [t["id"] for w in reserved.values() for t in w] for w in reserved.values()) | ||||||||||||||
|
Comment on lines
+466
to
+468
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix redundant loop in task presence check. The logic for verifying that Apply this diff to simplify and correct the logic: - not_running = all(
- task_id not in [t["id"] for w in active.values() for t in w] for w in active.values()
- ) and all(task_id not in [t["id"] for w in reserved.values() for t in w] for w in reserved.values())
+ active_task_ids = [t["id"] for tasks in active.values() for t in tasks]
+ reserved_task_ids = [t["id"] for tasks in reserved.values() for t in tasks]
+ not_running = task_id not in active_task_ids and task_id not in reserved_task_ids📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||
|
|
||||||||||||||
| self.assertTrue(not_running) | ||||||||||||||
|
|
||||||||||||||
| self.assertEqual(job.status, JobState.REVOKED.value) | ||||||||||||||
|
|
||||||||||||||
| def test_kill_task_prevents_execution(self): | ||||||||||||||
| from ami.jobs.models import Job, MLSubtaskNames, MLTaskRecord | ||||||||||||||
| from ami.ml.models.pipeline import process_pipeline_request | ||||||||||||||
| from config import celery_app | ||||||||||||||
|
|
||||||||||||||
| logger.info("Testing that killing a task prevents its execution.") | ||||||||||||||
| result = process_pipeline_request.apply_async(args=[{}, 1], countdown=5) | ||||||||||||||
| logger.info(f"Scheduled task with id {result.id} to run in 5 seconds.") | ||||||||||||||
| task_id = result.id | ||||||||||||||
|
|
||||||||||||||
| job = Job.objects.create( | ||||||||||||||
| job_type_key=MLJob.key, | ||||||||||||||
| project=self.project, | ||||||||||||||
| name="Test killing job tasks", | ||||||||||||||
| delay=0, | ||||||||||||||
| pipeline=self.pipeline, | ||||||||||||||
| source_image_collection=self.source_image_collection, | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| ml_task_record = MLTaskRecord.objects.create( | ||||||||||||||
| job=job, task_name=MLSubtaskNames.process_pipeline_request.value, task_id=task_id | ||||||||||||||
| ) | ||||||||||||||
| logger.info(f"Killing task {task_id} immediately.") | ||||||||||||||
| ml_task_record.kill_task() | ||||||||||||||
|
|
||||||||||||||
| async_result = celery_app.AsyncResult(task_id) | ||||||||||||||
| time.sleep(5) # the REVOKED STATUS isn't visible until the task is actually run after the delay | ||||||||||||||
|
|
||||||||||||||
| self.assertIn(async_result.state, ["REVOKED"]) | ||||||||||||||
| self.assertEqual(ml_task_record.status, "REVOKED") | ||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,6 +141,11 @@ def check_ml_job_status(ml_job_id: int): | |
| job.update_status(JobState.FAILURE) | ||
| job.finished_at = datetime.datetime.now() | ||
| job.save() | ||
|
|
||
| # Remove remaining tasks from the queue | ||
| for ml_task_record in job.ml_task_records.all(): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we have a Cancel action for the parent Job. Can you move or add this logic there? |
||
| ml_task_record.kill_task() | ||
|
|
||
| raise Exception(error_msg) | ||
|
|
||
|
|
||
|
|
@@ -171,5 +176,8 @@ def check_dangling_ml_jobs(): | |
| job.update_status(JobState.REVOKED) | ||
| job.finished_at = datetime.datetime.now() | ||
| job.save() | ||
|
|
||
| for ml_task_record in job.ml_task_records.all(): | ||
| ml_task_record.kill_task() | ||
| else: | ||
| logger.info(f"Job {job.pk} is active. Last checked at {last_checked}.") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use a timezone-aware default for
last_checked. DateTimeField(default=datetime.datetime.now) returns naïve datetimes when USE_TZ=True, triggering warnings and risking incorrect conversions. Please switch to django.utils.timezone.now, which returns an aware datetime. Suggested fix:
🤖 Prompt for AI Agents