@@ -262,6 +262,7 @@ def terminate(*args, show_log_lines=30, release_port=True):
262262 print (f"Failed to tail { arg .log_path } : { e } " )
263263 print (f"Traceback: { traceback .format_exc ()} " )
264264 if arg .process :
265+ print (f"Killing process { arg .process .pid } " )
265266 try :
266267 arg .process .kill ()
267268 arg .process .wait (timeout = 10 )
@@ -274,6 +275,8 @@ def terminate(*args, show_log_lines=30, release_port=True):
274275 USED_PORTS .discard (arg .port )
275276 except Exception :
276277 print (f"Failed to terminate process { arg .process .pid } " )
278+ else :
279+ print (f"Process is None on port { arg .port } " )
277280
278281
279282def request_completion (model_name , prompt , port ):
@@ -396,7 +399,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
396399 port = disagg_port )
397400 print (response )
398401 # kill gen1, the request should fail
399- terminate (gen_worker1 , release_port = False )
402+ terminate (gen_worker1 )
400403 await asyncio .sleep (CHECK_STATUS_INTERVAL )
401404 verify_cluster_info (False , 1 , 0 , port = disagg_port )
402405 with pytest .raises (Exception ):
@@ -422,7 +425,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
422425 assert len (response .choices [0 ].text ) >= 1
423426
424427 # kill ctx1, the request should fail
425- terminate (ctx_worker1 , release_port = False )
428+ terminate (ctx_worker1 )
426429 await asyncio .sleep (CHECK_STATUS_INTERVAL )
427430 verify_cluster_info (False , 0 , 1 , port = disagg_port )
428431 with pytest .raises (Exception ):
@@ -441,15 +444,11 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
441444 response_text = response .choices [0 ].text
442445 assert len (response .choices [0 ].text ) >= 1
443446
444- # restart ctx1 and gen1 with the same ports, we have 2 ctxs and 2 gens now
445- ctx_worker1 = run_ctx_worker (model_name ,
446- worker_config ,
447- work_dir ,
448- port = ctx_worker1 .port )
449- gen_worker1 = run_gen_worker (model_name ,
450- worker_config ,
451- work_dir ,
452- port = gen_worker1 .port )
447+ # start ctx1 and gen1 again, we have 2 ctxs and 2 gens now
448+ # Note: Do NOT start them on the same ports as the previous ones; the ports may not be released immediately after terminate,
449+ # which would cause a port conflict and a test timeout.
450+ ctx_worker1 = run_ctx_worker (model_name , worker_config , work_dir )
451+ gen_worker1 = run_gen_worker (model_name , worker_config , work_dir )
453452 await wait_for_worker_ready (ctx_worker1 .port )
454453 await wait_for_worker_ready (gen_worker1 .port )
455454 await asyncio .sleep (CHECK_STATUS_INTERVAL )
0 commit comments