
Commit 011e074

Scale in blocks at shutdown using Job Status Poller
This will now scale in blocks using the job status poller scale-in code, which means the DFK does not need to send its own BLOCK_INFO monitoring messages.

This is a minimal-ish change to which blocks get scaled in at shutdown: they now come from the job status poller's list. That will get pending blocks scaled in at shutdown, I think, but will now push the dynamically updated list to the cached side of the cache poll. What does that change? We will now be delayed in seeing ended jobs, but the executor.status data is already out of date in that sense the moment the call returns (although *less* out of date).

This patch is deliberately minimalist in that it does not attempt to move the scale-down code: this is a PR about changing behaviour, not about rewriting the scale-down strategy more seriously. The behaviour change is to move towards treating the job status poller's poll-item status as the source of best-estimated truth. Other work should probably do that moving, to complement the recent init_blocks handling PR #3283.
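To make the intended shutdown behaviour concrete, here is a minimal, self-contained Python sketch of the new loop. The FakeExecutor and FakePollItem classes are stand-ins invented for illustration; only the attributes actually used in the diff below (executor.bad_state_is_set, executor.label, pi.status, pi.scale_in) mirror the real interfaces, and everything else is an assumption.

import logging
from dataclasses import dataclass, field
from typing import Dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class FakeExecutor:
    # Stand-in for a BlockProviderExecutor: only the attributes the
    # shutdown loop reads are modelled here.
    label: str
    bad_state_is_set: bool = False


@dataclass
class FakePollItem:
    # Stand-in for a JobStatusPoller poll item: `status` maps block id
    # to a job status, and is treated as the best-estimated truth.
    executor: FakeExecutor
    status: Dict[str, str] = field(default_factory=dict)

    def scale_in(self, n: int) -> None:
        # The real scale-in cancels up to n blocks; asking for more
        # blocks than exist is tolerated.
        logger.info("Asking %s to cancel %d block(s)", self.executor.label, n)


def scale_in_at_shutdown(poll_items) -> None:
    # Mirrors the new cleanup() behaviour: every block known to the
    # poll item (pending or running) is asked to scale in, unless the
    # executor is in a bad state.
    for pi in poll_items:
        if not pi.executor.bad_state_is_set:
            pi.scale_in(len(pi.status))
        else:
            logger.warning("Not scaling in %s: bad state", pi.executor.label)


if __name__ == "__main__":
    items = [
        FakePollItem(FakeExecutor("htex_ok"), status={"0": "PENDING", "1": "RUNNING"}),
        FakePollItem(FakeExecutor("htex_bad", bad_state_is_set=True), status={"0": "RUNNING"}),
    ]
    scale_in_at_shutdown(items)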
1 parent 19e998a commit 011e074

File tree

1 file changed: +14 −16 lines


parsl/dataflow/dflow.py

Lines changed: 14 additions & 16 deletions
@@ -34,7 +34,6 @@
 from parsl.dataflow.taskrecord import TaskRecord
 from parsl.errors import ConfigurationError, InternalConsistencyError, NoDataFlowKernelError
 from parsl.jobs.job_status_poller import JobStatusPoller
-from parsl.jobs.states import JobStatus, JobState
 from parsl.usage_tracking.usage import UsageTracker
 from parsl.executors.base import ParslExecutor
 from parsl.executors.status_handling import BlockProviderExecutor
@@ -1216,22 +1215,21 @@ def cleanup(self) -> None:
 
         logger.info("Scaling in and shutting down executors")
 
+        for pi in self.job_status_poller._poll_items:
+            if not pi.executor.bad_state_is_set:
+                logger.info(f"Scaling in executor {pi.executor.label}")
+
+                # this code needs to be at least as many blocks as need
+                # cancelling, but it is safe to be more, as the scaling
+                # code will cope with being asked to cancel more blocks
+                # than exist.
+                block_count = len(pi.status)
+                pi.scale_in(block_count)
+
+            else:  # and bad_state_is_set
+                logger.warning(f"Not scaling in executor {pi.executor.label} because it is in bad state")
+
         for executor in self.executors.values():
-            if isinstance(executor, BlockProviderExecutor):
-                if not executor.bad_state_is_set:
-                    logger.info(f"Scaling in executor {executor.label}")
-                    if executor.provider:
-                        job_ids = executor.provider.resources.keys()
-                        block_ids = executor.scale_in(len(job_ids))
-                        if self.monitoring and block_ids:
-                            new_status = {}
-                            for bid in block_ids:
-                                new_status[bid] = JobStatus(JobState.CANCELLED)
-                            msg = executor.create_monitoring_info(new_status)
-                            logger.debug("Sending message {} to hub from DFK".format(msg))
-                            self.monitoring.send(MessageType.BLOCK_INFO, msg)
-                else:  # and bad_state_is_set
-                    logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
             logger.info(f"Shutting down executor {executor.label}")
             executor.shutdown()
             logger.info(f"Shut down executor {executor.label}")
