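Fixes for new refactor runs: guard worker shutdown on startup completion, correct stop-check precedence, and expand worker group tests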

markurtz · markurtz · commit 2fbf052f6682 · 2025-08-28T16:49:36.000-04:00
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
@@ -253,9 +253,12 @@ def _task_done(task):
                 pending_tasks.add(request_task)
                 request_task.add_done_callback(_task_done)
         except (asyncio.CancelledError, Exception) as err:
-            await self._cancel_remaining_requests(pending_tasks, all_requests_processed)
-            await self.messaging.stop()
-            await self.backend.process_shutdown()
+            if self.startup_completed:
+                await self._cancel_remaining_requests(
+                    pending_tasks, all_requests_processed
+                )
+                await self.messaging.stop()
+                await self.backend.process_shutdown()
 
             raise err
 
diff --git a/src/guidellm/scheduler/worker_group.py b/src/guidellm/scheduler/worker_group.py
@@ -144,7 +144,7 @@ def __init__(
 
     async def create_processes(self):
         """
-        Start the processes for the worker process group.
+        Create and initialize worker processes for distributed request processing.
 
         Sets up multiprocessing infrastructure and worker processes based on
         strategy constraints, backend capabilities, and system configuration.
@@ -399,11 +399,6 @@ class _WorkerGroupState(Generic[RequestT, MeasuredRequestTimingsT, ResponseT]):
     Handles request generation, state updates, constraint evaluation, and
     coordination between worker processes. Provides thread-safe state management
     with request lifecycle tracking and constraint-based termination logic.
-
-    :param start_time: Unix timestamp when processing should begin
-    :param num_processes: Number of worker processes in the group
-    :param constraints: Named constraints for controlling execution behavior
-    :param shutdown_event: Multiprocessing event for coordinated shutdown
     """
 
     def __init__(
@@ -414,6 +409,15 @@ def __init__(
         constraints: dict[str, Constraint],
         shutdown_event: Event,
     ):
+        """
+        Initialize worker group state management.
+
+        :param start_time: Unix timestamp when processing should begin
+        :param num_processes: Number of worker processes in the group
+        :param processes: List of worker process instances
+        :param constraints: Named constraints for controlling execution behavior
+        :param shutdown_event: Multiprocessing event for coordinated shutdown
+        """
         self._start_time = start_time
         self._update_lock: threading.Lock = threading.Lock()
         self._state: SchedulerState = SchedulerState(
@@ -527,7 +531,7 @@ def update_callback_receive(
         )
 
     def stop_callback_receive(
-        self, messaging: InterProcessMessaging, pending: bool, is_empty: bool
+        self, messaging: InterProcessMessaging, pending: bool, queue_empty: int
     ) -> bool:
         """
         Determine if message receiving should stop based on system state.
@@ -537,12 +541,12 @@ def stop_callback_receive(
 
         :param messaging: Inter-process messaging instance
         :param pending: Whether operations are still pending
-        :param is_empty: Whether receive queues are empty
+        :param queue_empty: The number of times the queue has reported empty in a row
         :return: True if message receiving should stop, False otherwise
         """
         return (
             not pending
-            and is_empty  # all updates pulled off
+            and queue_empty >= InterProcessMessaging.STOP_REQUIRED_QUEUE_EMPTY
             and messaging.send_stopped_event.is_set()  # No more requests will be added
             and self._shutdown_event.is_set()  # processing should stop
             and all(
diff --git a/src/guidellm/utils/messaging.py b/src/guidellm/utils/messaging.py
@@ -400,8 +400,10 @@ def check_stop(pending: bool, queue_empty: int) -> bool:
             return (
                 not pending
                 and queue_empty >= self.STOP_REQUIRED_QUEUE_EMPTY
-                and self.shutdown_event.is_set()
-                or any(event.is_set() for event in stop_events)
+                and (
+                    self.shutdown_event.is_set()
+                    or any(event.is_set() for event in stop_events)
+                )
             )
 
         return check_stop
diff --git a/tests/unit/scheduler/test_worker_group.py b/tests/unit/scheduler/test_worker_group.py
@@ -87,7 +87,7 @@ class TestWorkerProcessGroup:
                 "requests": None,
                 "cycle_requests": ["request1", "request2", "request3"],
                 "strategy": SynchronousStrategy(),
-                "constraints": {"max_requests": MaxNumberConstraint(max_num=10)},
+                "constraints": {"max_num": MaxNumberConstraint(max_num=10)},
             },
             {
                 "requests": None,
@@ -185,33 +185,137 @@ def test_initialization(self, valid_instances):
         assert instance._state is None
         assert instance.messaging is None
 
+    @pytest.mark.sanity
+    @pytest.mark.parametrize(
+        ("requests", "cycle_requests", "expected_error"),
+        [
+            (None, None, ValueError),
+            ([], iter([]), ValueError),  # cycle_requests as Iterator
+            (None, iter(["req1"]), ValueError),  # cycle_requests as Iterator
+        ],
+        ids=["no_requests", "cycle_as_iterator_empty", "cycle_as_iterator_data"],
+    )
+    def test_invalid_initialization_values(
+        self, requests, cycle_requests, expected_error
+    ):
+        """Test WorkerProcessGroup with invalid initialization values."""
+        with pytest.raises(expected_error):
+            WorkerProcessGroup(
+                requests=requests,
+                cycle_requests=cycle_requests,
+                backend=MockBackend(),
+                strategy=SynchronousStrategy(),
+                constraints={},
+            )
+
+    @pytest.mark.sanity
+    def test_invalid_initialization_missing(self):
+        """Test WorkerProcessGroup initialization without required fields."""
+        with pytest.raises(TypeError):
+            WorkerProcessGroup()
+
     @pytest.mark.smoke
-    # @async_timeout(5)
+    @async_timeout(10)
     @pytest.mark.asyncio
     async def test_lifecycle(self, valid_instances: tuple[WorkerProcessGroup, dict]):
         """Test the lifecycle methods of WorkerProcessGroup."""
-        instance, _ = valid_instances
+        instance, constructor_args = valid_instances
 
         # Test create processes
         await instance.create_processes()
-        # TODO: check valid process creation
+
+        # Check valid process creation
+        assert instance.mp_context is not None
+        assert instance.mp_manager is not None
+        assert instance.processes is not None
+        assert len(instance.processes) > 0
+        assert all(proc.is_alive() for proc in instance.processes)
+        assert instance.startup_barrier is not None
+        assert instance.shutdown_event is not None
+        assert instance.error_event is not None
+        assert instance.requests_completed_event is not None
+        assert instance.messaging is not None
 
         # Test start
         start_time = time.time() + 0.1
         await instance.start(start_time=start_time)
-        # TODO: check valid start behavior
+
+        # Check valid start behavior
+        assert instance.messaging is not None
+        assert instance._state is not None
+        assert instance._state._start_time == start_time
+        assert instance._state._state.num_processes == len(instance.processes)
+        assert not instance.error_event.is_set()
 
         # Test iter updates
-        updates = {}
-        async for resp, req, info, state in instance.request_updates():
-            pass
-        # TODO: validate correct updates based on requests, cycle_requests, and constraints
+        updates_list = []
+        responses_count = 0
+
+        async for (
+            response,
+            request,
+            request_info,
+            scheduler_state,
+        ) in instance.request_updates():
+            updates_list.append((response, request, request_info, scheduler_state))
+            if response is not None:
+                responses_count += 1
+
+            # Validate request info structure
+            assert hasattr(request_info, "request_id")
+            assert hasattr(request_info, "status")
+            valid_statuses = [
+                "queued",
+                "in_progress",
+                "completed",
+                "errored",
+                "cancelled",
+            ]
+            assert request_info.status in valid_statuses
+
+            # Validate state structure
+            assert hasattr(scheduler_state, "created_requests")
+            assert hasattr(scheduler_state, "processed_requests")
+            assert hasattr(scheduler_state, "successful_requests")
+            assert scheduler_state.created_requests >= 0
+            assert scheduler_state.processed_requests >= 0
+            assert scheduler_state.successful_requests >= 0
+
+        # Validate correctness of all updates
+        if constructor_args.get("requests") is not None:
+            assert len(updates_list) == 2 * len(constructor_args["requests"]), (
+                "Should have received updates for all requests"
+            )
+        if constructor_args.get("constraints", {}).get("max_num") is not None:
+            assert (
+                len(updates_list)
+                == 2 * constructor_args["constraints"]["max_num"].max_num
+            ), "Should not have received more updates than max_num constraint"
+
+        assert len(updates_list) > 0, "Should have received at least one update"
+
+        # Constraints should be satisfied
+        for constraint_name, _ in constructor_args["constraints"].items():
+            constraint_check = (
+                "max" in constraint_name.lower()
+                or "duration" in constraint_name.lower()
+            )
+            if constraint_check:
+                assert scheduler_state.end_processing_time is not None, (
+                    f"Should have stopped processing due to {constraint_name}"
+                )
 
         # Test shutdown
-        await instance.shutdown()
-        print(
-            f"\nRequests summary: created={state.created_requests}, queued={state.queued_requests}, processing={state.processing_requests}, processed={state.processed_requests}, successful={state.successful_requests}, cancelled={state.cancelled_requests}, errored={state.errored_requests}"
+        exceptions = await instance.shutdown()
+
+        # Check valid shutdown behavior
+        assert isinstance(exceptions, list), "Shutdown should return list of exceptions"
+        assert instance.messaging is None, "Messaging should be cleared after shutdown"
+        assert instance._state is None, "State should be cleared after shutdown"
+        assert instance.processes is None, "Processes should be cleared after shutdown"
+        assert instance.mp_manager is None, (
+            "MP manager should be cleared after shutdown"
+        )
+        assert instance.mp_context is None, (
+            "MP context should be cleared after shutdown"
         )
-        print(resp)
-        print(info)
-        # TODO: check valid shutdown behavior