refactor: remove unnecessary threading.Lock from CHWBL scheduler

Levi080513 · claude · Levi080513 · commit 20e98b3f4ab3 · 2026-02-14T15:50:51.000+08:00
Ray Serve runs in a single-threaded asyncio event loop. The formerly
locked sections contain no await points, so no other coroutine can be
interleaved with them and there is no concurrent access. The
threading.Lock is therefore unnecessary, and as a blocking lock it
could stall the event loop if contention ever occurred.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/cluster-image-builder/serve/_replica_scheduler/chwbl_scheduler.py b/cluster-image-builder/serve/_replica_scheduler/chwbl_scheduler.py
@@ -1,7 +1,6 @@
 import hashlib
 import logging
 import bisect
-import threading
 from typing import Dict, List, Set, Tuple, Optional
 
 from ray.serve._private.common import ReplicaID
@@ -39,9 +38,6 @@ def __init__(self, *args, **kwargs):
         self._hash_to_replica_id: Dict[int, ReplicaID] = {}  # Maps hash points to replica IDs
         self._sorted_hashes: List[int] = []  # Sorted list of hash points for binary search
 
-        # Lock for thread-safe updates
-        self._load_lock = threading.Lock()
-
     def initialize_state(
         self,
         virtual_nodes_per_replica: int = 100,
@@ -61,18 +57,17 @@ def initialize_state(
         )
 
     def _create_load_snapshot(self) -> Dict[ReplicaID, int]:
-        """Create a snapshot of current replica loads (thread-safe).
+        """Create a snapshot of current replica loads.
 
         Returns:
             A dictionary mapping replica_id to its current load.
         """
-        with self._load_lock:
-            snapshot = {}
-            for replica_id in self._replicas:
-                load = self._replica_queue_len_cache.get(replica_id)
-                snapshot[replica_id] = load if load is not None else 0
+        snapshot = {}
+        for replica_id in self._replicas:
+            load = self._replica_queue_len_cache.get(replica_id)
+            snapshot[replica_id] = load if load is not None else 0
 
-            return snapshot
+        return snapshot
 
     async def choose_replicas(
         self,
@@ -252,21 +247,20 @@ def on_request_completed(self, replica_id: ReplicaID, internal_request_id: str):
 
         Called by the router when a request finishes processing.
         """
-        with self._load_lock:
-            current_load = self._replica_queue_len_cache.get(replica_id)
-            if current_load is None:
-                logger.warning(
-                    f"CHWBL: Attempted to decrement load for {replica_id} but no load info exists"
-                )
-                return
+        current_load = self._replica_queue_len_cache.get(replica_id)
+        if current_load is None:
+            logger.warning(
+                f"CHWBL: Attempted to decrement load for {replica_id} but no load info exists"
+            )
+            return
 
-            new_load = max(0, current_load - 1)  # Ensure non-negative
-            self._replica_queue_len_cache.update(replica_id, new_load)
+        new_load = max(0, current_load - 1)  # Ensure non-negative
+        self._replica_queue_len_cache.update(replica_id, new_load)
 
-            logger.debug(
-                f"CHWBL: Decremented load for {replica_id}: "
-                f"{current_load} -> {new_load}"
-            )
+        logger.debug(
+            f"CHWBL: Decremented load for {replica_id}: "
+            f"{current_load} -> {new_load}"
+        )
 
     def _extract_cache_key(self, payload, request_id: str) -> str:
         """Extract cache key from OpenAI-compatible chat completions payload.