resolve comments

DNXie · DNXie · commit c783a80fe927 · 2025-09-22T17:04:24.000-07:00
diff --git a/src/forge/controller/service/router.py b/src/forge/controller/service/router.py
@@ -91,6 +91,7 @@ async def get_replica(
         )
         return replica
 
+
 class BatchRouter(Router):
     """
     Router wrapper that batches routing decisions.
@@ -121,6 +122,7 @@ def __init__(
 
         # Internal queue for batching routing requests
         self._queue: asyncio.Queue = asyncio.Queue()
+        self._running = True  # cleared by shutdown() to stop the batch loop
         # Background task that processes batches continuously
         self._batch_task: asyncio.Task = asyncio.create_task(self._batch_loop())
 
@@ -139,27 +141,31 @@ async def _batch_loop(self):
 
         This process repeats indefinitely until the task is cancelled.
         """
-        while True:
+        while self._running:
             batch = []
             futs = []
             sess_ids = []
-            start_time = time.time()
 
             # Wait for first request
             fut, healthy_replicas, sess_id, session_map = await self._queue.get()
             batch.append((healthy_replicas, sess_id, session_map))
             futs.append(fut)
             sess_ids.append(sess_id)
+            start_time = time.monotonic()
 
             while True:
                 try:
-                    timeout = max(0, self.batch_max_wait_s - (time.time() - start_time))
+                    timeout = max(
+                        0, self.batch_max_wait_s - (time.monotonic() - start_time)
+                    )
                     (
                         fut,
                         healthy_replicas,
                         sess_id,
                         session_map,
-                    ) = await asyncio.wait_for(self._queue.get(), timeout)
+                    ) = await asyncio.wait_for(
+                        self._queue.get(), timeout
+                    )  # wait for the next request, but no longer than the remaining batch window
                     batch.append((healthy_replicas, sess_id, session_map))
                     futs.append(fut)
                     sess_ids.append(sess_id)
@@ -190,11 +196,18 @@ async def get_replica(
         session_map: Optional[Dict[str, int]] = None,
     ) -> Replica:
         """Enqueue request and wait until batch assigns a replica."""
-        loop = asyncio.get_event_loop()
-        fut = loop.create_future()
-
+        fut = asyncio.Future()
         # Queue the request for batching - this is non-blocking
         self._queue.put_nowait((fut, healthy_replicas, sess_id, session_map))
 
         # Wait for the batch processor to resolve our future
-        return await fut
+        return await fut
+
+    async def shutdown(self):
+        """Stop the batch loop gracefully."""
+        self._running = False
+        self._batch_task.cancel()
+        try:
+            await self._batch_task
+        except asyncio.CancelledError:
+            pass
diff --git a/tests/unit_tests/test_service.py b/tests/unit_tests/test_service.py
@@ -686,6 +686,7 @@ async def test_session_router_with_round_robin_fallback():
 
     assert r1.idx == r2.idx == 0
 
+
 @pytest.mark.asyncio
 async def test_batching_router_batchsize_with_roundrobin():
     """Batch should flush when max batch size is reached using RoundRobinRouter."""
@@ -698,21 +699,25 @@ async def test_batching_router_batchsize_with_roundrobin():
         batch_max_wait_s=0.5,  # long enough to not trigger timeout
     )
 
-    # Enqueue `batch_size + 1` requests to force batch flush
-    tasks = [
-        asyncio.create_task(router.get_replica(replicas)) for _ in range(batch_size + 1)
-    ]
-    results = await asyncio.gather(*tasks)
+    try:
+        # Enqueue `batch_size + 1` requests to force batch flush
+        tasks = [
+            asyncio.create_task(router.get_replica(replicas))
+            for _ in range(batch_size + 1)
+        ]
+        results = await asyncio.gather(*tasks)
 
-    # Check all results are healthy replicas
-    assert all(r.state == ReplicaState.HEALTHY for r in results)
+        # Check all results are healthy replicas
+        assert all(r.state == ReplicaState.HEALTHY for r in results)
 
-    # Check results only use existing replica indices
-    indices = {r.idx for r in results}
-    assert indices.issubset({0, 1})
+        # Check results only use existing replica indices
+        indices = {r.idx for r in results}
+        assert indices.issubset({0, 1})
 
-    # Ensure batch queue is empty after flush
-    assert router._queue.qsize() == 0
+        # Ensure batch queue is empty after flush
+        assert router._queue.qsize() == 0
+    finally:
+        await router.shutdown()
 
 
 @pytest.mark.asyncio
@@ -725,19 +730,21 @@ async def test_batching_router_skips_unhealthy_replicas():
         batch_max_size=4,
         batch_max_wait_s=0.5,
     )
+    try:
+        # Start two requests that will form a batch
+        tasks = [asyncio.create_task(router.get_replica(replicas)) for _ in range(2)]
 
-    # Start two requests that will form a batch
-    tasks = [asyncio.create_task(router.get_replica(replicas)) for _ in range(2)]
-
-    # While they are waiting, mark replica 0 (least loaded) as unhealthy
-    await asyncio.sleep(0.01)
-    replicas[0].state = ReplicaState.UNHEALTHY
+        # While they are waiting, mark replica 0 (least loaded) as unhealthy
+        await asyncio.sleep(0.01)
+        replicas[0].state = ReplicaState.UNHEALTHY
 
-    results = await asyncio.gather(*tasks)
+        results = await asyncio.gather(*tasks)
 
-    # All results must be the *healthy* replica (idx=1)
-    assert all(r.idx == 1 for r in results)
-    assert results[0].state == ReplicaState.HEALTHY
+        # All results must be the *healthy* replica (idx=1)
+        assert all(r.idx == 1 for r in results)
+        assert results[0].state == ReplicaState.HEALTHY
+    finally:
+        await router.shutdown()
 
 
 @pytest.mark.asyncio
@@ -753,49 +760,52 @@ async def test_batching_router_two_batches_timing():
         batch_max_size=3,
         batch_max_wait_s=batch_wait_time,
     )
+    try:
+        # First batch: 2 requests that will timeout
+        start_time = time.time()
 
-    # First batch: 2 requests that will timeout
-    start_time = time.time()
+        # Create first batch tasks
+        first_batch_tasks = [
+            asyncio.create_task(router.get_replica(replicas)) for _ in range(2)
+        ]
 
-    # Create first batch tasks
-    first_batch_tasks = [
-        asyncio.create_task(router.get_replica(replicas)) for _ in range(2)
-    ]
+        # Wait for first batch to complete (should timeout after batch_wait_time)
+        first_results = await asyncio.gather(*first_batch_tasks)
+        first_batch_duration = time.time() - start_time
 
-    # Wait for first batch to complete (should timeout after batch_wait_time)
-    first_results = await asyncio.gather(*first_batch_tasks)
-    first_batch_duration = time.time() - start_time
+        # Verify first batch took approximately the timeout duration (tighter tolerance)
+        assert (
+            batch_wait_time <= first_batch_duration < batch_wait_time + 0.01
+        )  # 10ms tolerance on 50ms timeout
 
-    # Verify first batch took approximately the timeout duration (tighter tolerance)
-    assert (
-        batch_wait_time <= first_batch_duration < batch_wait_time + 0.01
-    )  # 10ms tolerance on 50ms timeout
+        # Verify first batch results (should pick lowest load replica)
+        assert all(r.idx == 0 for r in first_results)  # replica 0 has lower load
+        assert all(r.state == ReplicaState.HEALTHY for r in first_results)
 
-    # Verify first batch results (should pick lowest load replica)
-    assert all(r.idx == 0 for r in first_results)  # replica 0 has lower load
-    assert all(r.state == ReplicaState.HEALTHY for r in first_results)
+        # Second batch: 2 more requests (new timing cycle should start)
+        second_batch_start = time.time()
 
-    # Second batch: 2 more requests (new timing cycle should start)
-    second_batch_start = time.time()
+        # Create second batch tasks
+        second_batch_tasks = [
+            asyncio.create_task(router.get_replica(replicas)) for _ in range(2)
+        ]
 
-    # Create second batch tasks
-    second_batch_tasks = [
-        asyncio.create_task(router.get_replica(replicas)) for _ in range(2)
-    ]
+        # Wait for second batch to complete
+        second_results = await asyncio.gather(*second_batch_tasks)
+        second_batch_duration = time.time() - second_batch_start
 
-    # Wait for second batch to complete
-    second_results = await asyncio.gather(*second_batch_tasks)
-    second_batch_duration = time.time() - second_batch_start
+        # Verify second batch also took approximately the timeout duration (tighter tolerance)
+        assert batch_wait_time <= second_batch_duration < batch_wait_time + 0.01
 
-    # Verify second batch also took approximately the timeout duration (tighter tolerance)
-    assert batch_wait_time <= second_batch_duration < batch_wait_time + 0.01
+        # Verify second batch results
+        assert all(r.idx == 0 for r in second_results)  # should still pick lowest load
+        assert all(r.state == ReplicaState.HEALTHY for r in second_results)
 
-    # Verify second batch results
-    assert all(r.idx == 0 for r in second_results)  # should still pick lowest load
-    assert all(r.state == ReplicaState.HEALTHY for r in second_results)
+        # Ensure batch queue is empty after both batches
+        assert router._queue.qsize() == 0
+    finally:
+        await router.shutdown()
 
-    # Ensure batch queue is empty after both batches
-    assert router._queue.qsize() == 0
 
 # Router integeration tests
 
@@ -856,4 +866,4 @@ async def test_session_router_assigns_and_updates_session_map_in_service():
         assert values2[assigned_idx] == values1[assigned_idx] + 1
 
     finally:
-        await service.shutdown()
+        await service.shutdown()