Skip to content

Commit bc02088

Browse files
feat(sglang): Add dummy warmup req for prefill (#4058)
Co-authored-by: ishandhanani <[email protected]>
1 parent 4beada3 commit bc02088

File tree

1 file changed

+38
-0
lines changed
  • components/src/dynamo/sglang

1 file changed

+38
-0
lines changed

components/src/dynamo/sglang/main.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,9 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
145145

146146
engine = sgl.Engine(server_args=server_args)
147147

148+
# Perform dummy warmup for prefill worker to avoid initial TTFT hit
149+
await _warmup_prefill_engine(engine, server_args)
150+
148151
component = runtime.namespace(dynamo_args.namespace).component(
149152
dynamo_args.component
150153
)
@@ -405,6 +408,41 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
405408
handler.cleanup()
406409

407410

411+
async def _warmup_prefill_engine(
    engine: sgl.Engine, server_args, *, timeout_s: float = 1800
) -> None:
    """Send one dummy request through the prefill engine to reduce initial TTFT.

    The warmup is best-effort: a timeout or any other failure is logged as a
    warning and never propagated, so worker start-up continues regardless.

    Args:
        engine: Engine whose ``async_generate`` is driven with the dummy request.
        server_args: Only ``disaggregation_bootstrap_port`` is read from it.
        timeout_s: Maximum number of seconds to wait for the warmup request.
            The default of 1800s (30 min) leaves room for deep gemm precache.
    """
    logging.info("Start of prefill disaggregation warmup ...")
    try:
        # Imported lazily so a missing/renamed sglang symbol is handled by the
        # best-effort except below instead of breaking module import.
        from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST

        # Passed as a plain dict, which is the form Engine.async_generate
        # documents for sampling_params.
        # NOTE(review): a SamplingParams instance appears to be rejected when
        # sglang re-expands the value as keyword arguments, which would turn
        # the warmup into a silently logged failure -- confirm against the
        # pinned sglang version.
        sampling_params = {
            "temperature": 0.0,
            "max_new_tokens": 8,
            "ignore_eos": True,
        }

        async def _do_warmup() -> None:
            # With stream=True the awaited call yields an async iterator.
            # NOTE(review): FAKE_BOOTSTRAP_HOST presumably marks this as a
            # request with no real decode peer -- confirm in sglang.
            results = await engine.async_generate(
                input_ids=[0, 1, 2, 3],
                sampling_params=sampling_params,
                stream=True,
                bootstrap_host=FAKE_BOOTSTRAP_HOST,
                bootstrap_port=server_args.disaggregation_bootstrap_port,
                bootstrap_room=999999,
            )
            # Consume the stream so the request runs to completion.
            async for _ in results:
                pass

        await asyncio.wait_for(_do_warmup(), timeout=timeout_s)
        logging.info("Prefill warmup completed")
    except asyncio.TimeoutError:
        logging.warning("Prefill warmup timed out after %ss", timeout_s)
    except Exception as e:
        # Deliberately broad: warmup must never prevent the worker from
        # starting. Keep the traceback so the cause remains diagnosable.
        logging.warning("Prefill warmup failed: %s", e, exc_info=True)
444+
445+
408446
async def graceful_shutdown(runtime):
409447
logging.info("Received shutdown signal, shutting down DistributedRuntime")
410448
runtime.shutdown()

0 commit comments

Comments
 (0)