Skip to content

Commit 55b5910

Browse files
committed
fix: always run scheduler.step() on MLX executor to prevent GPU race
When has_waiting=False, scheduler.step() was called inline on the event loop thread while VLM vision encoding could be running on _mlx_executor. Two threads accessing the Metal GPU simultaneously caused a segfault. Now all scheduler steps run on the single-worker executor, which serializes all MLX operations.
1 parent 26148f5 commit 55b5910

File tree

1 file changed

+8
-18
lines changed

1 file changed

+8
-18
lines changed

omlx/engine_core.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,12 @@ def is_running(self) -> bool:
140140
return self._running
141141

142142
async def _engine_loop(self) -> None:
143-
"""Main engine loop - hybrid executor for prefill vs generation.
143+
"""Main engine loop - runs scheduler steps on the MLX executor.
144144
145-
Prefill steps (long prompts) are run in a thread executor to keep
146-
the asyncio event loop responsive. Generation-only steps (~1-3ms)
147-
are called directly to avoid ~0.5-2ms context switch overhead,
148-
giving ~5-10% throughput improvement during sustained generation.
145+
All scheduler steps run on _mlx_executor (single-worker thread) to
146+
guarantee that MLX GPU operations are never concurrent. VLM vision
147+
encoding also runs on the same executor, so inline scheduler.step()
148+
on the event loop would race with vision mx.eval() and segfault.
149149
"""
150150
loop = asyncio.get_running_loop()
151151

@@ -156,19 +156,9 @@ async def _engine_loop(self) -> None:
156156
while self._running:
157157
try:
158158
if self.scheduler.has_requests():
159-
# Hybrid approach: use executor only when prefill is likely.
160-
# Prefill happens when there are waiting requests that need
161-
# to be inserted into the batch (may block for seconds).
162-
# Generation-only steps are fast (<3ms) and can run inline.
163-
has_waiting = self.scheduler.get_num_waiting() > 0
164-
if has_waiting:
165-
output = await loop.run_in_executor(
166-
self._mlx_executor, self.scheduler.step
167-
)
168-
else:
169-
output = self.scheduler.step()
170-
# Yield to event loop after inline step
171-
await asyncio.sleep(0)
159+
output = await loop.run_in_executor(
160+
self._mlx_executor, self.scheduler.step
161+
)
172162
self._steps_executed += 1
173163

174164
# Fast path: distribute outputs to collectors

0 commit comments

Comments
 (0)