@@ -145,6 +145,9 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
145145
146146 engine = sgl .Engine (server_args = server_args )
147147
148+ # Perform dummy warmup for prefill worker to avoid initial TTFT hit
149+ await _warmup_prefill_engine (engine , server_args )
150+
148151 component = runtime .namespace (dynamo_args .namespace ).component (
149152 dynamo_args .component
150153 )
@@ -405,6 +408,41 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
405408 handler .cleanup ()
406409
407410
async def _warmup_prefill_engine(
    engine: sgl.Engine, server_args, *, timeout_s: float = 1800
) -> None:
    """Run one throwaway generation on the prefill engine to reduce initial TTFT.

    Warmup is best-effort: a timeout or any other failure is logged as a
    warning and never propagated, so a failed warmup does not block worker
    startup.

    Args:
        engine: Engine whose ``async_generate`` is called for the warmup.
        server_args: Object providing ``disaggregation_bootstrap_port``,
            which is forwarded as the request's ``bootstrap_port``.
        timeout_s: Upper bound, in seconds, on the whole warmup request.
            The 30-minute default leaves room for deep gemm precache.
    """
    logging.info("Start of prefill disaggregation warmup ...")
    try:
        # Imported lazily and inside the ``try`` so that an SGLang build
        # without this symbol degrades to a logged warning.
        from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST

        # ``async_generate`` takes sampling parameters as a plain dict; the
        # engine builds its own SamplingParams from it. Passing a
        # SamplingParams instance would fail during request handling and be
        # swallowed by the broad ``except`` below, silently skipping warmup.
        sampling_params = {
            "temperature": 0.0,
            "max_new_tokens": 8,
            "ignore_eos": True,
        }

        async def _do_warmup() -> None:
            stream = await engine.async_generate(
                input_ids=[0, 1, 2, 3],
                sampling_params=sampling_params,
                stream=True,
                # NOTE(review): presumably the fake host lets the request
                # complete without a real decode peer; confirm against SGLang.
                bootstrap_host=FAKE_BOOTSTRAP_HOST,
                bootstrap_port=server_args.disaggregation_bootstrap_port,
                bootstrap_room=999999,
            )
            # Drain the stream so the request runs to completion.
            async for _ in stream:
                pass

        await asyncio.wait_for(_do_warmup(), timeout=timeout_s)
        logging.info("Prefill warmup completed")
    except asyncio.TimeoutError:
        logging.warning(f"Prefill warmup timed out after {timeout_s}s")
    except Exception as e:
        # Deliberately broad: warmup must never take the worker down.
        logging.warning(f"Prefill warmup failed: {e}")
444+
445+
408446async def graceful_shutdown (runtime ):
409447 logging .info ("Received shutdown signal, shutting down DistributedRuntime" )
410448 runtime .shutdown ()
0 commit comments