Commit 514848f

_xfail_gloo_windows

1 parent e5f2120

File tree

9 files changed: +27 -20 lines

src/lightning/pytorch/utilities/imports.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 _TORCHMETRICS_GREATER_EQUAL_0_9_1 = RequirementCache("torchmetrics>=0.9.1")
 _TORCHMETRICS_GREATER_EQUAL_0_11 = RequirementCache("torchmetrics>=0.11.0")  # using new API with task
 _TORCHMETRICS_GREATER_EQUAL_1_0_0 = RequirementCache("torchmetrics>=1.0.0")
-_TORCH_GREATER_EQUAL_2_8 = RequirementCache("torch>=2.8.0")
+_TORCH_EQUAL_2_8 = RequirementCache("torch>=2.8.0,<2.9.0")

 _OMEGACONF_AVAILABLE = package_available("omegaconf")
 _TORCHVISION_AVAILABLE = RequirementCache("torchvision")
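
The rename reflects the narrowed pin: the flag is now truthy only for torch 2.8.x, not for every release at or above 2.8.0. A minimal sketch of how such a flag behaves, assuming the `lightning_utilities` `RequirementCache` API this module already builds on:

    from lightning_utilities.core.imports import RequirementCache

    _TORCH_EQUAL_2_8 = RequirementCache("torch>=2.8.0,<2.9.0")

    # The cache checks the installed distribution against the requirement and
    # is truthy only when the pin is satisfied: torch 2.8.1 matches, 2.9.0 does not.
    if _TORCH_EQUAL_2_8:
        print("running under torch 2.8.x")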

tests/tests_pytorch/callbacks/test_spike.py

Lines changed: 9 additions & 9 deletions
@@ -7,7 +7,7 @@
 from lightning.fabric.utilities.spike import TrainingSpikeException
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.callbacks.spike import SpikeDetection
-from tests_pytorch.helpers.runif import _XFAIL_GLOO_WINDOWS, RunIf
+from tests_pytorch.helpers.runif import RunIf, _xfail_gloo_windows


 class IdentityModule(LightningModule):
@@ -54,14 +54,14 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
     # NOTE FOR ALL FOLLOWING TESTS:
     # adding run on linux only because multiprocessing on other platforms takes forever
     [
-        pytest.param(0, 1, None, True),
-        pytest.param(0, 1, None, False),
-        pytest.param(0, 1, float("inf"), True, marks=_XFAIL_GLOO_WINDOWS),
-        pytest.param(0, 1, float("inf"), False, marks=_XFAIL_GLOO_WINDOWS),
-        pytest.param(0, 1, float("-inf"), True, marks=_XFAIL_GLOO_WINDOWS),
-        pytest.param(0, 1, float("-inf"), False, marks=_XFAIL_GLOO_WINDOWS),
-        pytest.param(0, 1, float("NaN"), True, marks=_XFAIL_GLOO_WINDOWS),
-        pytest.param(0, 1, float("NaN"), False, marks=_XFAIL_GLOO_WINDOWS),
+        pytest.param(0, 1, None, True, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, None, False, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, float("inf"), True, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, float("inf"), False, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, float("-inf"), True, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, float("-inf"), False, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, float("NaN"), True, marks=_xfail_gloo_windows),
+        pytest.param(0, 1, float("NaN"), False, marks=_xfail_gloo_windows),
         pytest.param(0, 2, None, True, marks=RunIf(linux_only=True)),
         pytest.param(0, 2, None, False, marks=RunIf(linux_only=True)),
         pytest.param(1, 2, None, True, marks=RunIf(linux_only=True)),
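
Besides lowercasing the name, the change extends the mark to the single-device `None` cases, so every single-device case in this table now carries the gloo xfail. As a usage note, `pytest.param`'s `marks` argument accepts either a single mark or a list, so the xfail could also be combined with other conditions; a hypothetical combination, not part of this commit:

    pytest.param(0, 2, None, True, marks=[_xfail_gloo_windows, RunIf(linux_only=True)]),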

tests/tests_pytorch/helpers/runif.py

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@
 import pytest

 from lightning.fabric.utilities.imports import _IS_WINDOWS
-from lightning.pytorch.utilities.imports import _TORCH_GREATER_EQUAL_2_8
+from lightning.pytorch.utilities.imports import _TORCH_EQUAL_2_8
 from lightning.pytorch.utilities.testing import _runif_reasons


@@ -24,9 +24,9 @@ def RunIf(**kwargs):


 # todo: RuntimeError: makeDeviceForHostname(): unsupported gloo device
-_XFAIL_GLOO_WINDOWS = pytest.mark.xfail(
+_xfail_gloo_windows = pytest.mark.xfail(
     RuntimeError,
     strict=True,
-    condition=(_IS_WINDOWS and _TORCH_GREATER_EQUAL_2_8),
+    condition=(_IS_WINDOWS and _TORCH_EQUAL_2_8),
     reason="makeDeviceForHostname(): unsupported gloo device",
 )
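
With `condition=` supplied as a keyword, the mark only takes effect on Windows under torch 2.8.x and is inert everywhere else. Because `strict=True`, an affected run that unexpectedly passes is reported as a failure, so the marker also signals when the upstream gloo bug gets fixed. A minimal sketch of these semantics, using standard pytest behavior and a stand-in condition:

    import pytest

    ON_AFFECTED_PLATFORM = False  # stand-in for (_IS_WINDOWS and _TORCH_EQUAL_2_8)

    @pytest.mark.xfail(condition=ON_AFFECTED_PLATFORM, strict=True, reason="demo")
    def test_demo():
        # When the condition is False, the mark is inert and the test must pass.
        # When it is True, a failure is reported as the expected XFAIL, while a
        # pass is reported as XPASS(strict) and fails the run.
        assert True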

tests/tests_pytorch/loops/test_prediction_loop.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset
 from lightning.pytorch.overrides.distributed import _IndexBatchSamplerWrapper
+from tests_pytorch.helpers.runif import _xfail_gloo_windows


 def test_prediction_loop_stores_predictions(tmp_path):
@@ -51,6 +52,7 @@ def predict_step(self, batch, batch_idx):
     assert trainer.predict_loop.predictions == []


+@_xfail_gloo_windows
 @pytest.mark.parametrize("use_distributed_sampler", [False, True])
 def test_prediction_loop_batch_sampler_set_epoch_called(tmp_path, use_distributed_sampler):
     """Tests that set_epoch is called on the dataloader's batch sampler (if any) during prediction."""

tests/tests_pytorch/models/test_amp.py

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@
 from lightning.fabric.plugins.environments import SLURMEnvironment
 from lightning.pytorch import Trainer
 from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset
-from tests_pytorch.helpers.runif import RunIf
+from tests_pytorch.helpers.runif import RunIf, _xfail_gloo_windows


 class AMPTestModel(BoringModel):
@@ -53,7 +53,7 @@ def _assert_autocast_enabled(self):
     [
         ("single_device", "16-mixed", 1),
         ("single_device", "bf16-mixed", 1),
-        ("ddp_spawn", "16-mixed", 2),
+        pytest.param("ddp_spawn", "16-mixed", 2, marks=_xfail_gloo_windows),
         pytest.param("ddp_spawn", "bf16-mixed", 2, marks=RunIf(skip_windows=True)),
     ],
 )

tests/tests_pytorch/serve/test_servable_module_validator.py

Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,7 @@
 from lightning.pytorch import Trainer
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.serve.servable_module_validator import ServableModule, ServableModuleValidator
+from tests_pytorch.helpers.runif import _xfail_gloo_windows


 class ServableBoringModel(BoringModel, ServableModule):
@@ -28,13 +29,14 @@ def configure_response(self):
         return {"output": [0, 1]}


-@pytest.mark.xfail(strict=False, reason="test is too flaky in CI")  # todo
+@pytest.mark.flaky(reruns=3)
 def test_servable_module_validator():
     model = ServableBoringModel()
     callback = ServableModuleValidator()
     callback.on_train_start(Trainer(accelerator="cpu"), model)


+@_xfail_gloo_windows
 @pytest.mark.flaky(reruns=3)
 def test_servable_module_validator_with_trainer(tmp_path, mps_count_0):
     callback = ServableModuleValidator()
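
Swapping the non-strict xfail for `flaky(reruns=3)` changes the failure mode: a non-strict xfail can never fail CI even if the test breaks for good, whereas the rerun marker tolerates transient flakes but still fails after the retries are exhausted. A minimal sketch of the rerun behavior, assuming the pytest-rerunfailures plugin that provides the `flaky` marker used here:

    import pytest

    _attempts = {"count": 0}

    @pytest.mark.flaky(reruns=3)  # retry up to 3 times before reporting a failure
    def test_sometimes_flaky():
        _attempts["count"] += 1
        # Fails on the first attempt, passes on the rerun: the test ultimately
        # reports as passed, with the rerun noted by the plugin.
        assert _attempts["count"] > 1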

tests/tests_pytorch/strategies/launchers/test_multiprocessing.py

Lines changed: 3 additions & 1 deletion
@@ -25,7 +25,7 @@
 from lightning.pytorch.strategies import DDPStrategy
 from lightning.pytorch.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher
 from lightning.pytorch.trainer.states import TrainerFn
-from tests_pytorch.helpers.runif import RunIf
+from tests_pytorch.helpers.runif import RunIf, _xfail_gloo_windows


 @mock.patch("lightning.pytorch.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[])
@@ -194,6 +194,7 @@ def on_fit_start(self) -> None:
         assert torch.equal(self.layer.weight.data, self.tied_layer.weight.data)


+@_xfail_gloo_windows
 def test_memory_sharing_disabled(tmp_path):
     """Test that the multiprocessing launcher disables memory sharing on model parameters and buffers to avoid race
     conditions on model updates."""
@@ -219,6 +220,7 @@ def test_check_for_missing_main_guard():
     launcher.launch(function=Mock())


+@_xfail_gloo_windows
 def test_fit_twice_raises(mps_count_0):
     model = BoringModel()
     trainer = Trainer(

tests/tests_pytorch/trainer/connectors/test_data_connector.py

Lines changed: 2 additions & 1 deletion
@@ -37,7 +37,7 @@
 from lightning.pytorch.utilities.combined_loader import CombinedLoader
 from lightning.pytorch.utilities.data import _update_dataloader
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from tests_pytorch.helpers.runif import RunIf
+from tests_pytorch.helpers.runif import RunIf, _xfail_gloo_windows


 @RunIf(skip_windows=True)
@@ -123,6 +123,7 @@ def on_train_end(self):
         self.ctx.__exit__(None, None, None)


+@_xfail_gloo_windows
 @pytest.mark.parametrize("num_workers", [0, 1, 2])
 def test_dataloader_persistent_workers_performance_warning(num_workers, tmp_path):
     """Test that when the multiprocessing start-method is 'spawn', we recommend setting `persistent_workers=True`."""

tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@
 from lightning.pytorch.trainer.states import RunningStage
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
 from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_11 as _TM_GE_0_11
-from tests_pytorch.helpers.runif import RunIf
+from tests_pytorch.helpers.runif import RunIf, _xfail_gloo_windows


 def test__training_step__log(tmp_path):
@@ -346,7 +346,7 @@ def validation_step(self, batch, batch_idx):
     ("devices", "accelerator"),
     [
         (1, "cpu"),
-        (2, "cpu"),
+        pytest.param(2, "cpu", marks=_xfail_gloo_windows),
         pytest.param(2, "gpu", marks=RunIf(min_cuda_gpus=2)),
     ],
 )
