[http://nvbugs/5649010][fix] fix test_auto_scaling.py::test_worker_restart timeout (#9775)

reasonsolo · web-flow · commit 52f78e4000cf · 2025-12-08T03:26:01.000-08:00
Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
diff --git a/tests/integration/defs/disaggregated/test_auto_scaling.py b/tests/integration/defs/disaggregated/test_auto_scaling.py
@@ -262,6 +262,7 @@ def terminate(*args, show_log_lines=30, release_port=True):
                 print(f"Failed to tail {arg.log_path}: {e}")
                 print(f"Traceback: {traceback.format_exc()}")
             if arg.process:
+                print(f"Killing process {arg.process.pid}")
                 try:
                     arg.process.kill()
                     arg.process.wait(timeout=10)
@@ -274,6 +275,8 @@ def terminate(*args, show_log_lines=30, release_port=True):
                         USED_PORTS.discard(arg.port)
                 except Exception:
                     print(f"Failed to terminate process {arg.process.pid}")
+            else:
+                print(f"Process is None on port {arg.port}")
 
 
 def request_completion(model_name, prompt, port):
@@ -396,7 +399,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
                                       port=disagg_port)
         print(response)
         # kill gen1, the request should fail
-        terminate(gen_worker1, release_port=False)
+        terminate(gen_worker1)
         await asyncio.sleep(CHECK_STATUS_INTERVAL)
         verify_cluster_info(False, 1, 0, port=disagg_port)
         with pytest.raises(Exception):
@@ -422,7 +425,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
         assert len(response.choices[0].text) >= 1
 
         # kill ctx1, the request should fail
-        terminate(ctx_worker1, release_port=False)
+        terminate(ctx_worker1)
         await asyncio.sleep(CHECK_STATUS_INTERVAL)
         verify_cluster_info(False, 0, 1, port=disagg_port)
         with pytest.raises(Exception):
@@ -441,15 +444,11 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
         response_text = response.choices[0].text
         assert len(response.choices[0].text) >= 1
 
-        # restart ctx1 and gen1 with the same ports, we have 2 ctxs and 2 gens now
-        ctx_worker1 = run_ctx_worker(model_name,
-                                     worker_config,
-                                     work_dir,
-                                     port=ctx_worker1.port)
-        gen_worker1 = run_gen_worker(model_name,
-                                     worker_config,
-                                     work_dir,
-                                     port=gen_worker1.port)
+        # start ctx1 and gen1 again, we have 2 ctxs and 2 gens now
+        # Note: Do NOT restart them on the same ports as before. A port may not be released immediately after terminate,
+        # which would cause a port conflict and a test timeout.
+        ctx_worker1 = run_ctx_worker(model_name, worker_config, work_dir)
+        gen_worker1 = run_gen_worker(model_name, worker_config, work_dir)
         await wait_for_worker_ready(ctx_worker1.port)
         await wait_for_worker_ready(gen_worker1.port)
         await asyncio.sleep(CHECK_STATUS_INTERVAL)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -437,7 +437,6 @@ unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_
 unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644)
 unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629)
-disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5649010)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5722653)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5722653)