ai-dynamo · rmccorm4 · Nov 8, 2025 · Nov 8, 2025
@@ -149,6 +149,7 @@ def is_ready(self, response) -> bool:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_sglang_aggregated(request, predownload_models):
     """
     Test ETCD High Availability with leader failover using SGLang.
@@ -209,6 +210,7 @@ def test_etcd_ha_failover_sglang_aggregated(request, predownload_models):
 @pytest.mark.gpu_2
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_sglang_disaggregated(
     request, predownload_models, set_ucx_tls_no_mm
 ):
@@ -277,6 +279,7 @@ def test_etcd_ha_failover_sglang_disaggregated(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_sglang_aggregated(request, predownload_models):
     """
     Test that frontend and worker shut down when single ETCD node is terminated using SGLang.
@@ -333,6 +336,7 @@ def test_etcd_non_ha_shutdown_sglang_aggregated(request, predownload_models):
 @pytest.mark.gpu_2
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_sglang_disaggregated(
     request, predownload_models, set_ucx_tls_no_mm
 ):

@@ -135,6 +135,7 @@ def is_ready(self, response) -> bool:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_trtllm_aggregated(request, predownload_models):
     """
     Test ETCD High Availability with leader failover for TRT-LLM in aggregated mode.
@@ -195,6 +196,7 @@ def test_etcd_ha_failover_trtllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_trtllm_disaggregated(
     request, predownload_models, set_ucx_tls_no_mm
 ):
@@ -262,6 +264,7 @@ def test_etcd_ha_failover_trtllm_disaggregated(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_trtllm_aggregated(request, predownload_models):
     """
     Test that frontend and worker shut down when single ETCD node is terminated for TRT-LLM in aggregated mode.
@@ -321,6 +324,7 @@ def test_etcd_non_ha_shutdown_trtllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_trtllm_disaggregated(
     request, predownload_models, set_ucx_tls_no_mm
 ):

@@ -117,6 +117,7 @@ def is_ready(self, response) -> bool:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_vllm_aggregated(request, predownload_models):
     """
     Test ETCD High Availability with leader failover.
@@ -175,6 +176,7 @@ def test_etcd_ha_failover_vllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_vllm_disaggregated(
     request, predownload_models, set_ucx_tls_no_mm
 ):
@@ -239,6 +241,7 @@ def test_etcd_ha_failover_vllm_disaggregated(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_vllm_aggregated(request, predownload_models):
     """
     Test that frontend and worker shut down when single ETCD node is terminated.
@@ -293,6 +296,7 @@ def test_etcd_non_ha_shutdown_vllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_vllm_disaggregated(
     request, predownload_models, set_ucx_tls_no_mm
 ):