Skip to content

Commit 52f78e4

Browse files
authored
[http://nvbugs/5649010][fix] fix test_auto_scaling.py::test_worker_restart timeout (#9775)
Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
1 parent 96d9b67 commit 52f78e4

File tree

2 files changed

+10
-12
lines changed

2 files changed

+10
-12
lines changed

tests/integration/defs/disaggregated/test_auto_scaling.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ def terminate(*args, show_log_lines=30, release_port=True):
262262
print(f"Failed to tail {arg.log_path}: {e}")
263263
print(f"Traceback: {traceback.format_exc()}")
264264
if arg.process:
265+
print(f"Killing process {arg.process.pid}")
265266
try:
266267
arg.process.kill()
267268
arg.process.wait(timeout=10)
@@ -274,6 +275,8 @@ def terminate(*args, show_log_lines=30, release_port=True):
274275
USED_PORTS.discard(arg.port)
275276
except Exception:
276277
print(f"Failed to terminate process {arg.process.pid}")
278+
else:
279+
print(f"Process is None on port {arg.port}")
277280

278281

279282
def request_completion(model_name, prompt, port):
@@ -396,7 +399,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
396399
port=disagg_port)
397400
print(response)
398401
# kill gen1, the request should fail
399-
terminate(gen_worker1, release_port=False)
402+
terminate(gen_worker1)
400403
await asyncio.sleep(CHECK_STATUS_INTERVAL)
401404
verify_cluster_info(False, 1, 0, port=disagg_port)
402405
with pytest.raises(Exception):
@@ -422,7 +425,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
422425
assert len(response.choices[0].text) >= 1
423426

424427
# kill ctx1, the request should fail
425-
terminate(ctx_worker1, release_port=False)
428+
terminate(ctx_worker1)
426429
await asyncio.sleep(CHECK_STATUS_INTERVAL)
427430
verify_cluster_info(False, 0, 1, port=disagg_port)
428431
with pytest.raises(Exception):
@@ -441,15 +444,11 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
441444
response_text = response.choices[0].text
442445
assert len(response.choices[0].text) >= 1
443446

444-
# restart ctx1 and gen1 with the same ports, we have 2 ctxs and 2 gens now
445-
ctx_worker1 = run_ctx_worker(model_name,
446-
worker_config,
447-
work_dir,
448-
port=ctx_worker1.port)
449-
gen_worker1 = run_gen_worker(model_name,
450-
worker_config,
451-
work_dir,
452-
port=gen_worker1.port)
447+
# start ctx1 and gen1 again, we have 2 ctxs and 2 gens now
448+
# Note: Do NOT start them with the same ports as the previous ones; the ports may not be released immediately after terminate,
449+
# causing a port conflict and test timeout.
450+
ctx_worker1 = run_ctx_worker(model_name, worker_config, work_dir)
451+
gen_worker1 = run_gen_worker(model_name, worker_config, work_dir)
453452
await wait_for_worker_ready(ctx_worker1.port)
454453
await wait_for_worker_ready(gen_worker1.port)
455454
await asyncio.sleep(CHECK_STATUS_INTERVAL)

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,6 @@ unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_
437437
unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644)
438438
unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644)
439439
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629)
440-
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5649010)
441440
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653)
442441
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5722653)
443442
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5722653)

0 commit comments

Comments
 (0)