Skip to content

Commit 7ac9ff9

Browse files
Levi080513claude
andcommitted
fix: CHWBL scheduler should use own load balancing when initial replica is not a candidate
When the initial replica from consistent hashing is not in the candidate list, the scheduler now continues walking the ring with CHWBL load-aware logic instead of returning all candidates to Ray's default scheduling. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 20e98b3 commit 7ac9ff9

File tree

1 file changed

+9
-7
lines changed

1 file changed

+9
-7
lines changed

cluster-image-builder/serve/_replica_scheduler/chwbl_scheduler.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,6 @@ async def choose_replicas(
9999
replica_hash, replica_idx = self._search(payload_hash)
100100
initial_replica_id = self._hash_to_replica_id[replica_hash]
101101

102-
if initial_replica_id not in candidate_map:
103-
return [candidate_replicas]
104-
105102
logger.debug(
106103
f"CHWBL: Initial lookup for payload hash {payload_hash} -> "
107104
f"replica {initial_replica_id}"
@@ -112,6 +109,7 @@ async def choose_replicas(
112109

113110
# Start from the initial replica and check load constraints
114111
checked_replica_ids: Set[ReplicaID] = set()
112+
default_replica_id = None
115113
current_idx = replica_idx
116114
while len(checked_replica_ids) < len(candidate_map):
117115
current_hash = self._sorted_hashes[current_idx]
@@ -124,6 +122,9 @@ async def choose_replicas(
124122

125123
checked_replica_ids.add(current_replica_id)
126124

125+
if default_replica_id is None:
126+
default_replica_id = current_replica_id
127+
127128
# Check if this replica meets the load constraints using snapshot
128129
if self._check_load_with_snapshot(current_replica_id, load_snapshot):
129130
logger.info(
@@ -136,13 +137,14 @@ async def choose_replicas(
136137
# Move to next replica
137138
current_idx = (current_idx + 1) % len(self._sorted_hashes)
138139

139-
# All replicas overloaded, fallback to primary to preserve affinity
140+
# All replicas overloaded, fallback to first candidate on ring to preserve affinity
141+
fallback_id = default_replica_id
140142
logger.warning(
141-
f"CHWBL: Using primary replica {initial_replica_id} "
143+
f"CHWBL: Using fallback replica {fallback_id} "
142144
f"as no replica met load factor for payload hash {payload_hash}, "
143-
f"snapshot_load={load_snapshot.get(initial_replica_id, 0)}"
145+
f"snapshot_load={load_snapshot.get(fallback_id, 0)}"
144146
)
145-
return [[candidate_map[initial_replica_id]]]
147+
return [[candidate_map[fallback_id]]]
146148

147149
def _check_load_with_snapshot(
148150
self, replica_id: ReplicaID, load_snapshot: Dict[ReplicaID, int]

0 commit comments

Comments
 (0)