fix: cp move some ft to nightly (#7279)

dmitry-tokarev-nv · web-flow · commit f874fe9e3cce · 2026-03-12T14:22:42.000-04:00
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
diff --git a/tests/fault_tolerance/cancellation/test_sglang.py b/tests/fault_tolerance/cancellation/test_sglang.py
@@ -33,7 +33,7 @@
     pytest.mark.sglang,
     pytest.mark.e2e,
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.nightly,
     pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
 
diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -36,7 +36,7 @@
     pytest.mark.gpu_1,
     pytest.mark.e2e,
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.nightly,
     pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
     pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
 ]
diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -34,8 +34,6 @@
     pytest.mark.vllm,
     pytest.mark.e2e,
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
-    pytest.mark.gpu_1,
     pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
 
@@ -203,6 +201,8 @@ def is_ready(self, response) -> bool:
 
 
 @pytest.mark.timeout(110)  # 3x average
+@pytest.mark.post_merge
+@pytest.mark.gpu_1
 def test_request_cancellation_vllm_aggregated(
     request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -284,6 +284,8 @@ def test_request_cancellation_vllm_aggregated(
 
 
 @pytest.mark.timeout(150)  # 3x average
+@pytest.mark.nightly
+@pytest.mark.gpu_2
 def test_request_cancellation_vllm_decode_cancel(
     request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -365,6 +367,8 @@ def test_request_cancellation_vllm_decode_cancel(
 
 
 @pytest.mark.timeout(150)  # 3x average
+@pytest.mark.nightly
+@pytest.mark.gpu_2
 def test_request_cancellation_vllm_prefill_cancel(
     request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
diff --git a/tests/fault_tolerance/migration/test_sglang.py b/tests/fault_tolerance/migration/test_sglang.py
@@ -31,7 +31,6 @@
     pytest.mark.gpu_1,
     pytest.mark.e2e,
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
     pytest.mark.parametrize(
         "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
     ),
@@ -211,6 +210,7 @@ def is_ready(self, response) -> bool:
 
 
 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_sglang_aggregated(
     request,
     runtime_services_dynamic_ports,
@@ -262,6 +262,7 @@ def test_request_migration_sglang_aggregated(
 @pytest.mark.skip(reason="Cannot reliably migrate at Prefill that finish < 1 ms")
 @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_sglang_prefill(
     request,
     runtime_services_dynamic_ports,
@@ -330,6 +331,7 @@ def test_request_migration_sglang_prefill(
 
 @pytest.mark.skip(reason="KV cache transfer may fail")
 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_sglang_kv_transfer(
     request,
     runtime_services_dynamic_ports,
@@ -397,6 +399,7 @@ def test_request_migration_sglang_kv_transfer(
 
 
 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_sglang_decode(
     request,
     runtime_services_dynamic_ports,
diff --git a/tests/fault_tolerance/migration/test_trtllm.py b/tests/fault_tolerance/migration/test_trtllm.py
@@ -31,7 +31,6 @@
     pytest.mark.gpu_1,
     pytest.mark.e2e,
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
     pytest.mark.parametrize(
         "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
     ),
@@ -188,8 +187,8 @@ def is_ready(self, response) -> bool:
         return False
 
 
-@pytest.mark.timeout(290)  # 3x average
-@pytest.mark.post_merge
+@pytest.mark.timeout(290)
+@pytest.mark.post_merge  # 3x average
 def test_request_migration_trtllm_aggregated(
     request,
     runtime_services_dynamic_ports,
@@ -240,6 +239,7 @@ def test_request_migration_trtllm_aggregated(
 
 @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_trtllm_prefill(
     request,
     runtime_services_dynamic_ports,
@@ -308,6 +308,7 @@ def test_request_migration_trtllm_prefill(
 
 @pytest.mark.skip(reason="Decode worker can get stuck downloading kv cache")
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_trtllm_kv_transfer(
     request,
     runtime_services_dynamic_ports,
@@ -375,6 +376,7 @@ def test_request_migration_trtllm_kv_transfer(
 
 
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_trtllm_decode(
     request,
     runtime_services_dynamic_ports,
diff --git a/tests/fault_tolerance/migration/test_vllm.py b/tests/fault_tolerance/migration/test_vllm.py
@@ -32,7 +32,6 @@
     pytest.mark.gpu_1,
     pytest.mark.e2e,
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
     pytest.mark.parametrize(
         "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
     ),
@@ -208,6 +207,7 @@ def is_ready(self, response) -> bool:
 
 
 @pytest.mark.timeout(290)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_vllm_aggregated(
     request,
     runtime_services_dynamic_ports,
@@ -258,6 +258,7 @@ def test_request_migration_vllm_aggregated(
 
 @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_vllm_prefill(
     request,
     runtime_services_dynamic_ports,
@@ -335,6 +336,7 @@ def test_request_migration_vllm_prefill(
     ),
 )
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_vllm_kv_transfer(
     request,
     runtime_services_dynamic_ports,
@@ -412,6 +414,7 @@ def test_request_migration_vllm_kv_transfer(
     ),
 )
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_vllm_decode(
     request,
     runtime_services_dynamic_ports,

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`pytest.mark.sglang,`
`34`	`34`	`pytest.mark.e2e,`
`35`	`35`	`pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),`
`36`		`- pytest.mark.post_merge, # post_merge to pinpoint failure commit`
	`36`	`+ pytest.mark.nightly,`
`37`	`37`	`pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),`
`38`	`38`	`]`
`39`	`39`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`pytest.mark.gpu_1,`
`37`	`37`	`pytest.mark.e2e,`
`38`	`38`	`pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),`
`39`		`- pytest.mark.post_merge, # post_merge to pinpoint failure commit`
	`39`	`+ pytest.mark.nightly,`
`40`	`40`	`pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),`
`41`	`41`	`pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),`
`42`	`42`	`]`