Lightning-AI · Borda · Jun 12, 2025 · Jun 11, 2025 · Jun 11, 2025 · Jun 12, 2025
@@ -154,8 +154,8 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str:
 
     """
     path_readme = os.path.join(path_dir, "README.md")
-    with open(path_readme, encoding="utf-8") as fo:
-        text = fo.read()
+    with open(path_readme, encoding="utf-8") as fopen:
+        text = fopen.read()
 
     # drop images from readme
     text = text.replace(
@@ -308,17 +308,17 @@ def copy_replace_imports(
         if ext in (".pyc",):
             continue
         # Try to parse everything else
-        with open(fp, encoding="utf-8") as fo:
+        with open(fp, encoding="utf-8") as fopen:
             try:
-                lines = fo.readlines()
+                lines = fopen.readlines()
             except UnicodeDecodeError:
                 # a binary file, skip
                 print(f"Skipped replacing imports for {fp}")
                 continue
         lines = _replace_imports(lines, list(zip(source_imports, target_imports)), lightning_by=lightning_by)
         os.makedirs(os.path.dirname(fp_new), exist_ok=True)
-        with open(fp_new, "w", encoding="utf-8") as fo:
-            fo.writelines(lines)
+        with open(fp_new, "w", encoding="utf-8") as fopen:
+            fopen.writelines(lines)
 
 
 def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> None:
@@ -370,10 +370,10 @@ def _prune_packages(req_file: str, packages: Sequence[str]) -> None:
 
     @staticmethod
     def _replace_min(fname: str) -> None:
-        with open(fname, encoding="utf-8") as fo:
-            req = fo.read().replace(">=", "==")
-        with open(fname, "w", encoding="utf-8") as fw:
-            fw.write(req)
+        with open(fname, encoding="utf-8") as fopen:
+            req = fopen.read().replace(">=", "==")
+        with open(fname, "w", encoding="utf-8") as fwrite:
+            fwrite.write(req)
 
     @staticmethod
     def replace_oldest_ver(requirement_fnames: Sequence[str] = REQUIREMENT_FILES_ALL) -> None:
@@ -471,15 +471,15 @@ def convert_version2nightly(ver_file: str = "src/version.info") -> None:
         """Load the actual version and convert it to the nightly version."""
         from datetime import datetime
 
-        with open(ver_file) as fo:
-            version = fo.read().strip()
+        with open(ver_file) as fopen:
+            version = fopen.read().strip()
         # parse X.Y.Z version and prune any suffix
         vers = re.match(r"(\d+)\.(\d+)\.(\d+).*", version)
         # create timestamp  YYYYMMDD
         timestamp = datetime.now().strftime("%Y%m%d")
         version = f"{'.'.join(vers.groups())}.dev{timestamp}"
-        with open(ver_file, "w") as fo:
-            fo.write(version + os.linesep)
+        with open(ver_file, "w") as fopen:
+            fopen.write(version + os.linesep)
 
     @staticmethod
     def generate_docker_tags(

@@ -34,11 +34,12 @@ ENV \
     MAKEFLAGS="-j2"
 
 RUN \
-    apt-get update --fix-missing && apt-get install -y wget && \
-    apt-get update -qq --fix-missing && \
-    NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
     CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
+    apt-get update -qq --fix-missing && apt-get install -y wget && \
+    NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
+    echo "NCCL version found: $NCCL_VER" && \
     TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V  | head -n1)-1+cuda${CUDA_VERSION_MM} && \
+    echo "NCCL version to install: $TO_INSTALL_NCCL" && \
     apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
         build-essential \
         pkg-config \
@@ -96,7 +97,7 @@ RUN \
       --extra-index-url="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/"
 
 RUN \
-    # Show what we have
+    # Show what we have \
     pip --version && \
     pip list && \
     python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \

@@ -418,7 +418,7 @@ def load(self, state: Optional[Mapping], path: str) -> None:
         """Loads a checkpoint from a given file into state.
 
         Args:
-            state: a mapping contaning model, optimizer and lr scheduler
+            state: a mapping containing model, optimizer and lr scheduler
             path: the path to load the checkpoint from
 
         """

@@ -30,7 +30,7 @@ def accuracy(predictions, targets):
 def fast_adapt(batch, learner, loss, adaptation_steps, shots, ways):
     data, labels = batch
 
-    # Separate data into adaptation/evalutation sets
+    # Separate data into adaptation/evaluation sets
     adaptation_indices = torch.zeros(data.size(0), dtype=bool)
     adaptation_indices[torch.arange(shots * ways) * 2] = True
     evaluation_indices = ~adaptation_indices

@@ -34,7 +34,7 @@ def fast_adapt(batch, learner, loss, adaptation_steps, shots, ways, device):
     data, labels = batch
     data, labels = data.to(device), labels.to(device)
 
-    # Separate data into adaptation/evalutation sets
+    # Separate data into adaptation/evaluation sets
     adaptation_indices = torch.zeros(data.size(0), dtype=bool)
     adaptation_indices[torch.arange(shots * ways) * 2] = True
     evaluation_indices = ~adaptation_indices

@@ -353,7 +353,7 @@ def generate_trajectory_samples(self) -> tuple[list[torch.Tensor], list[torch.Te
                 # logging
                 self.avg_reward = sum(self.epoch_rewards) / self.steps_per_epoch
 
-                # if epoch ended abruptly, exlude last cut-short episode to prevent stats skewness
+                # if epoch ended abruptly, exclude last cut-short episode to prevent stats skewness
                 epoch_rewards = self.epoch_rewards
                 if not done:
                     epoch_rewards = epoch_rewards[:-1]

@@ -33,7 +33,7 @@ blank = true
 
 [tool.codespell]
 # Todo: enable also python files in a next step
-skip = '*.py'
+#skip = '*.py'
 quiet-level = 3
 # comma separated list of words; waiting for:
 #  https://github.com/codespell-project/codespell/issues/2839#issuecomment-1731601603

@@ -27,7 +27,7 @@
      - for `pytorch-lightning` use `export PACKAGE_NAME=pytorch ; pip install .`
      - for `lightning-fabric` use `export PACKAGE_NAME=fabric ; pip install .`
 
-3. Building packages as sdist or binary wheel and installing or publish to PyPI afterwords you use command
+3. To build packages as sdist or binary wheel and install or publish them to PyPI afterwards, use the command
     `python setup.py sdist` or `python setup.py bdist_wheel` accordingly.
    In case you want to build just a particular package you want to set an environment variable:
    `PACKAGE_NAME=lightning|pytorch|fabric python setup.py sdist|bdist_wheel`

@@ -5,5 +5,5 @@
 if not os.path.exists(_VERSION_PATH):
     # relevant for `bdist_wheel`
     _VERSION_PATH = os.path.join(_PACKAGE_ROOT, "version.info")
-with open(_VERSION_PATH, encoding="utf-8") as fo:
-    version = fo.readlines()[0].strip()
+with open(_VERSION_PATH, encoding="utf-8") as fopen:
+    version = fopen.readlines()[0].strip()
@@ -83,7 +83,7 @@ class _Connector:
             1. strategy class
             2. strategy str registered with STRATEGY_REGISTRY
             3. strategy str in _strategy_type enum which listed in each strategy as
-               backend (registed these too, and _strategy_type could be deprecated)
+               backend (registered these too, and _strategy_type could be deprecated)
 
         C. plugins flag could be:
             1. precision class (should be removed, and precision flag should allow user pass classes)

@@ -327,7 +327,7 @@ def setup_optimizers(self, *optimizers: Optimizer) -> Union[_FabricOptimizer, tu
         ``.setup(model, optimizer, ...)`` instead to jointly set them up.
 
         Args:
-            *optimizers: One or more optmizers to set up.
+            *optimizers: One or more optimizers to set up.
 
         Returns:
             The wrapped optimizer(s).

@@ -87,7 +87,7 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo
 
     @override
     def reduce_boolean_decision(self, decision: bool, all: bool = True) -> bool:
-        """Reduces a boolean decision over distributed processes. By default is analagous to ``all`` from the standard
+        """Reduces a boolean decision over distributed processes. By default is analogous to ``all`` from the standard
         library, returning ``True`` only if all input decisions evaluate to ``True``. If ``all`` is set to ``False``,
         it behaves like ``any`` instead.
 

@@ -430,7 +430,7 @@ def on_validation_batch_start(
             if self.val_progress_bar_id is not None:
                 self.progress.update(self.val_progress_bar_id, advance=0, visible=False)
 
-            # TODO: remove old tasks when new onces are created
+            # TODO: remove old tasks when new ones are created
             self.val_progress_bar_id = self._add_task(
                 self.total_val_batches_current_dataloader,
                 self.validation_description,

@@ -262,7 +262,7 @@ def current_epoch(self) -> int:
     def global_step(self) -> int:
         """Total training batches seen across all epochs.
 
-        If no Trainer is attached, this propery is 0.
+        If no Trainer is attached, this property is 0.
 
         """
         return self.trainer.global_step if self._trainer else 0

@@ -84,7 +84,7 @@ def __init__(self, dim: int, dropout: float = 0.1, max_len: int = 5000) -> None:
     def forward(self, x: Tensor) -> Tensor:
         if self.pe is None:
             # 1) can't use buffer, see https://github.com/pytorch/pytorch/issues/68407
-            # 2) can't use parameter becauses pe gets sliced and DDP requires all params to participate in forward
+            # 2) can't use parameter because pe gets sliced and DDP requires all params to participate in forward
             # TODO: Could make this a `nn.Parameter` with `requires_grad=False`
             self.pe = self._init_pos_encoding(device=x.device)
 

@@ -93,7 +93,7 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo
 
     @override
     def reduce_boolean_decision(self, decision: bool, all: bool = True) -> bool:
-        """Reduces a boolean decision over distributed processes. By default is analagous to ``all`` from the standard
+        """Reduces a boolean decision over distributed processes. By default is analogous to ``all`` from the standard
         library, returning ``True`` only if all input decisions evaluate to ``True``. If ``all`` is set to ``False``,
         it behaves like ``any`` instead.
 

@@ -467,7 +467,7 @@ def _check_strategy_and_fallback(self) -> None:
         if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
             raise ValueError(
                 f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
-                f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead."
+                f" platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
             )
         if strategy_flag:
             self._strategy_flag = strategy_flag

@@ -5,5 +5,5 @@
 if not os.path.exists(_VERSION_PATH):
     # relevant for `bdist_wheel`
     _VERSION_PATH = os.path.join(_PACKAGE_ROOT, "version.info")
-with open(_VERSION_PATH, encoding="utf-8") as fo:
-    version = fo.readlines()[0].strip()
+with open(_VERSION_PATH, encoding="utf-8") as fopen:
+    version = fopen.readlines()[0].strip()
@@ -5,5 +5,5 @@
 if not os.path.exists(_VERSION_PATH):
     # relevant for `bdist_wheel`
     _VERSION_PATH = os.path.join(_PACKAGE_ROOT, "version.info")
-with open(_VERSION_PATH, encoding="utf-8") as fo:
-    version = fo.readlines()[0].strip()
+with open(_VERSION_PATH, encoding="utf-8") as fopen:
+    version = fopen.readlines()[0].strip()
@@ -126,7 +126,7 @@ def train_fabric_ddp(fabric):
 def run_parity_test(accelerator: str = "cpu", devices: int = 2, tolerance: float = 0.02):
     cuda_reset()
 
-    # Launch processes with Fabric and re-use them for the PyTorch training for convenience
+    # Launch processes with Fabric and reuse them for the PyTorch training for convenience
     fabric = Fabric(accelerator=accelerator, strategy="ddp", devices=devices)
     fabric.launch()
 

@@ -174,7 +174,7 @@ def test_validate_user_settings():
     with pytest.raises(ValueError, match="the number of nodes configured in SLURM .* does not match"):
         env.validate_settings(num_devices=4, num_nodes=1)
 
-    # in interactive mode, validation is skipped becauses processes get launched by Fabric/Trainer, not SLURM
+    # in interactive mode, validation is skipped because processes get launched by Fabric/Trainer, not SLURM
     with mock.patch(
         "lightning.fabric.plugins.environments.slurm.SLURMEnvironment.job_name", return_value="interactive"
     ):

@@ -85,7 +85,7 @@ def test_reapply_compile():
     fabric.launch()
 
     model = BoringModel()
-    # currently (PyTorch 2.6) using ruduce-overhead here casues a RuntimeError:
+    # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError:
     # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run.
     compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {}
     compiled_model = torch.compile(model, **compile_kwargs)

@@ -412,7 +412,7 @@ def test_reapply_compile():
     fabric.launch()
 
     model = BoringModel()
-    # currently (PyTorch 2.6) using ruduce-overhead here casues a RuntimeError:
+    # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError:
     # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run.
     compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {}
     compiled_model = torch.compile(model, **compile_kwargs)

@@ -194,23 +194,23 @@ def name() -> str:
     class Prec(Precision):
         pass
 
-    class Strat(SingleDeviceStrategy):
+    class TestStrategy(SingleDeviceStrategy):
         pass
 
-    strategy = Strat(device=torch.device("cpu"), accelerator=Accel(), precision=Prec())
+    strategy = TestStrategy(device=torch.device("cpu"), accelerator=Accel(), precision=Prec())
     connector = _Connector(strategy=strategy, devices=2)
     assert isinstance(connector.accelerator, Accel)
-    assert isinstance(connector.strategy, Strat)
+    assert isinstance(connector.strategy, TestStrategy)
     assert isinstance(connector.precision, Prec)
     assert connector.strategy is strategy
 
-    class Strat(DDPStrategy):
+    class TestStrategy(DDPStrategy):
         pass
 
-    strategy = Strat(accelerator=Accel(), precision=Prec())
+    strategy = TestStrategy(accelerator=Accel(), precision=Prec())
     connector = _Connector(strategy=strategy, devices=2)
     assert isinstance(connector.accelerator, Accel)
-    assert isinstance(connector.strategy, Strat)
+    assert isinstance(connector.strategy, TestStrategy)
     assert isinstance(connector.precision, Prec)
     assert connector.strategy is strategy
 

@@ -303,7 +303,7 @@ def test_throughput_monitor_eval(tmp_path, fn):
     assert logger_mock.log_metrics.mock_calls == [
         call(metrics={**expected, f"{fn}|batches": 3, f"{fn}|samples": 9}, step=3),
         call(metrics={**expected, f"{fn}|batches": 6, f"{fn}|samples": 18}, step=6),
-        # the step doesnt repeat
+        # the step doesn't repeat
         call(metrics={**expected, f"{fn}|batches": 9, f"{fn}|samples": 27}, step=9),
         call(metrics={**expected, f"{fn}|batches": 12, f"{fn}|samples": 36}, step=12),
     ]
@@ -326,8 +326,8 @@ def test_model_checkpoint_to_yaml(tmp_path, save_top_k: int):
 
     path_yaml = tmp_path / "best_k_models.yaml"
     checkpoint.to_yaml(path_yaml)
-    with open(path_yaml) as fo:
-        d = yaml.full_load(fo)
+    with open(path_yaml) as fopen:
+        d = yaml.full_load(fopen)
     best_k = dict(checkpoint.best_k_models.items())
     assert d == best_k
 

@@ -45,7 +45,7 @@ def configure_optimizers(self):
 
 
 def test_init_optimizers_resets_lightning_optimizers(tmp_path):
-    """Test that the Trainer resets the `lightning_optimizers` list everytime new optimizers get initialized."""
+    """Test that the Trainer resets the `lightning_optimizers` list every time new optimizers get initialized."""
 
     def compare_optimizers():
         assert trainer.strategy._lightning_optimizers[0].optimizer is trainer.optimizers[0]

@@ -49,7 +49,7 @@ def test_cpu_slurm_save_load(_, tmp_path):
     trainer.fit(model)
     real_global_step = trainer.global_step
 
-    # traning complete
+    # training complete
     assert trainer.state.finished, "cpu model failed to complete"
 
     # predict with trained model before saving

@@ -547,7 +547,7 @@ def test_strict_model_load_more_params(monkeypatch, tmp_path, tmpdir_server, url
     )
     trainer.fit(model)
 
-    # traning complete
+    # training complete
     assert trainer.state.finished, f"Training failed with {trainer.state}"
 
     # save model
@@ -587,7 +587,7 @@ def test_strict_model_load_less_params(monkeypatch, tmp_path, tmpdir_server, url
     )
     trainer.fit(model)
 
-    # traning complete
+    # training complete
     assert trainer.state.finished, f"Training failed with {trainer.state}"
 
     # save model

@@ -16,6 +16,7 @@
 from typing import Any, Optional
 from unittest.mock import MagicMock, Mock
 
+import pytest
 import torch
 
 from lightning.fabric.plugins import CheckpointIO, TorchCheckpointIO
@@ -97,6 +98,7 @@ def test_checkpoint_plugin_called(tmp_path):
     checkpoint_plugin.load_checkpoint.assert_called_with(str(tmp_path / "last-v1.ckpt"))
 
 
+@pytest.mark.flaky(reruns=3)
 def test_async_checkpoint_plugin(tmp_path):
     """Ensure that the custom checkpoint IO plugin and torch checkpoint IO plugin is called when async saving and
     loading."""

@@ -230,7 +230,7 @@ def test_fit_twice_raises(mps_count_0):
         barebones=True,
     )
     trainer.fit(model)
-    trainer.test(model)  # make sure testing in between doesnt impact the result
+    trainer.test(model)  # make sure testing in between doesn't impact the result
     trainer.fit_loop.max_epochs += 1
     with pytest.raises(NotImplementedError, match=r"twice.*is not supported"):
         trainer.fit(model)