Commit 6496a71

Merge branch 'master' into docs_finetuning_callback_example
2 parents 98c58b0 + c943c05

File tree: 11 files changed, +164 -30 lines

docs/source-pytorch/deploy/production_advanced_2.rst

Lines changed: 30 additions & 17 deletions
@@ -7,15 +7,20 @@ Deploy models into production (advanced)
 
 ----
 
-*********************************
-Compile your model to TorchScript
-*********************************
-`TorchScript <https://pytorch.org/docs/stable/jit.html>`_ allows you to serialize your models in a way that it can be loaded in non-Python environments.
-The ``LightningModule`` has a handy method :meth:`~lightning.pytorch.core.LightningModule.to_torchscript` that returns a scripted module which you
-can save or directly use.
+************************************
+Export your model with torch.export
+************************************
+
+`torch.export <https://pytorch.org/docs/stable/export.html>`_ is the recommended way to capture PyTorch models for
+deployment in production environments. It produces a clean intermediate representation with strong soundness guarantees,
+making models suitable for inference optimization and cross-platform deployment.
+You can export any ``LightningModule`` using the ``torch.export.export()`` API.
 
 .. testcode:: python
 
+    import torch
+    from torch.export import export
+
     class SimpleModel(LightningModule):
         def __init__(self):
             super().__init__()
@@ -25,25 +30,27 @@ can save or directly use.
             return torch.relu(self.l1(x.view(x.size(0), -1)))
 
 
-    # create the model
+    # create the model and example input
     model = SimpleModel()
-    script = model.to_torchscript()
+    example_input = torch.randn(1, 64)
 
-    # save for use in production environment
-    torch.jit.save(script, "model.pt")
+    # export the model
+    exported_program = export(model, (example_input,))
 
-It is recommended that you install the latest supported version of PyTorch to use this feature without limitations.
+    # save for use in production environment
+    torch.export.save(exported_program, "model.pt2")
 
-Once you have the exported model, you can run it in PyTorch or C++ runtime:
+It is recommended that you install the latest supported version of PyTorch to use this feature without
+limitations. Once you have the exported model, you can load and run it:
 
 .. code-block:: python
 
     inp = torch.rand(1, 64)
-    scripted_module = torch.jit.load("model.pt")
-    output = scripted_module(inp)
+    loaded_program = torch.export.load("model.pt2")
+    output = loaded_program.module()(inp)
 
 
-If you want to script a different method, you can decorate the method with :func:`torch.jit.export`:
+For more complex models, you can also export specific methods by creating a wrapper:
 
 .. code-block:: python
 
@@ -54,7 +61,6 @@ If you want to script a different method, you can decorate the method with :func:`torch.jit.export`:
             self.dropout = nn.Dropout()
             self.mc_iteration = mc_iteration
 
-        @torch.jit.export
         def predict_step(self, batch, batch_idx):
             # enable Monte Carlo Dropout
             self.dropout.train()
@@ -66,4 +72,11 @@ If you want to script a different method, you can decorate the method with :func:`torch.jit.export`:
 
 
     model = LitMCdropoutModel(...)
-    script = model.to_torchscript(file_path="model.pt", method="script")
+    example_batch = torch.randn(32, 10)  # example input
+
+    # Export the predict_step method
+    exported_program = torch.export.export(
+        lambda batch, idx: model.predict_step(batch, idx),
+        (example_batch, 0)
+    )
+    torch.export.save(exported_program, "mc_dropout_model.pt2")

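As context for the docs change above, here is a minimal self-contained sketch of the torch.export round trip the new text describes. It uses a plain nn.Module so it runs without Lightning installed; a LightningModule behaves the same way since it subclasses nn.Module. The shapes and file name are illustrative only.

import torch
from torch import nn
from torch.export import export


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(64, 4)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))


model = SimpleModel()
example_input = torch.randn(1, 64)

# capture the model into an ExportedProgram and serialize it
exported_program = export(model, (example_input,))
torch.export.save(exported_program, "model.pt2")

# later, e.g. in a production service: load and call the captured module
loaded_program = torch.export.load("model.pt2")
output = loaded_program.module()(torch.rand(1, 64))
print(output.shape)  # torch.Size([1, 4])
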
requirements/fabric/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 15 additions & 0 deletions
@@ -37,6 +37,7 @@
 from lightning.fabric.strategies.registry import _StrategyRegistry
 from lightning.fabric.strategies.strategy import _Sharded
 from lightning.fabric.utilities.distributed import log
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.load import _move_state_into
 from lightning.fabric.utilities.rank_zero import rank_zero_info, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed
@@ -47,6 +48,7 @@
     from torch.optim.lr_scheduler import _LRScheduler
 
 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
+_DEEPSPEED_GREATER_EQUAL_0_16 = RequirementCache("deepspeed>=0.16.0")
 
 
 # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -239,6 +241,19 @@ def __init__(
                 " Install it by running `pip install -U deepspeed`."
             )
 
+        if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
+            # Starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` when loading full checkpoints.
+            # DeepSpeed added support for this behavior in version 0.16.0.
+            import deepspeed
+
+            deepspeed_version = deepspeed.__version__
+
+            raise ImportError(
+                f"PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
+                f"Detected DeepSpeed version: {deepspeed_version}. "
+                "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
+            )
+
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,

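For context, here is a standalone sketch of the compatibility-guard pattern this hunk adds, assuming the `RequirementCache` and `compare_version` helpers from lightning_utilities (the library Fabric's own imports module builds on); the function name is hypothetical:

import operator

from lightning_utilities.core.imports import RequirementCache, compare_version

# compare_version checks an installed package's version against a bound;
# RequirementCache lazily evaluates a pip-style specifier and is truthy
# only when the installed distribution satisfies it.
_TORCH_GREATER_EQUAL_2_6 = compare_version("torch", operator.ge, "2.6.0")
_DEEPSPEED_GREATER_EQUAL_0_16 = RequirementCache("deepspeed>=0.16.0")


def _check_deepspeed_compat() -> None:
    # PyTorch 2.6 made `torch.load` default to `weights_only=True`;
    # DeepSpeed handles that only from 0.16.0, so this pairing fails fast.
    if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
        raise ImportError(
            "PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
            "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
        )
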
src/lightning/fabric/utilities/imports.py

Lines changed: 1 addition & 0 deletions
@@ -36,5 +36,6 @@
 _TORCH_GREATER_EQUAL_2_4_1 = compare_version("torch", operator.ge, "2.4.1")
 _TORCH_GREATER_EQUAL_2_5 = compare_version("torch", operator.ge, "2.5.0")
 _TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0")
+_TORCH_GREATER_EQUAL_2_6 = compare_version("torch", operator.ge, "2.6.0")
 _TORCHMETRICS_GREATER_EQUAL_1_0_0 = compare_version("torchmetrics", operator.ge, "1.0.0")
 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed preventing recursive symlink creation iwhen `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))
 
 
+- Fixed `ModelPruning` sparsity logging bug that caused incorrect sparsity percentages ([#21223](https://github.com/Lightning-AI/pytorch-lightning/pull/21223))
+
+
 - Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))
 
 

src/lightning/pytorch/callbacks/pruning.py

Lines changed: 1 addition & 1 deletion
@@ -349,7 +349,7 @@ def apply_pruning(self, amount: Union[int, float]) -> None:
     def _log_sparsity_stats(
         self, prev: list[tuple[int, int]], curr: list[tuple[int, int]], amount: Union[int, float] = 0
     ) -> None:
-        total_params = sum(p.numel() for layer, _ in self._parameters_to_prune for p in layer.parameters())
+        total_params = sum(total for _, total in curr)
         prev_total_zeros = sum(zeros for zeros, _ in prev)
         curr_total_zeros = sum(zeros for zeros, _ in curr)
         log.info(

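To illustrate the fix, a small sketch with hypothetical numbers. Each entry in `curr` (and `prev`) is a `(num_zeros, num_elements)` pair for one pruned tensor:

# two pruned weight tensors: 500/1000 and 250/500 elements zeroed
curr = [(500, 1000), (250, 500)]

# new denominator: only the elements of the tensors actually being pruned
total_params = sum(total for _, total in curr)      # 1500
curr_total_zeros = sum(zeros for zeros, _ in curr)  # 750
print(f"sparsity: {curr_total_zeros / total_params:.2%}")  # 50.00%

The old denominator summed `p.numel()` over every parameter of each layer in `_parameters_to_prune` (biases included, and a layer listed for multiple parameters counted repeatedly), so the denominator was inflated and the logged sparsity percentages came out too low.
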
src/lightning/pytorch/strategies/deepspeed.py

Lines changed: 15 additions & 0 deletions
@@ -35,10 +35,12 @@
 from lightning.fabric.strategies import _StrategyRegistry
 from lightning.fabric.strategies.deepspeed import (
     _DEEPSPEED_AVAILABLE,
+    _DEEPSPEED_GREATER_EQUAL_0_16,
     _format_precision_config,
     _validate_checkpoint_directory,
     _validate_device_index_selection,
 )
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import _PATH
@@ -262,6 +264,19 @@ def __init__(
                 " Install it by running `pip install -U deepspeed`."
             )
 
+        if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
+            # Starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` when loading full checkpoints.
+            # DeepSpeed added support for this behavior in version 0.16.0.
+            import deepspeed
+
+            deepspeed_version = deepspeed.__version__
+
+            raise ImportError(
+                f"PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
+                f"Detected DeepSpeed version: {deepspeed_version}. "
+                "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
+            )
+
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,

src/lightning/pytorch/trainer/connectors/logger_connector/result.py

Lines changed: 1 addition & 2 deletions
@@ -91,8 +91,7 @@ def _generate_sync_fn(self) -> None:
         """Used to compute the syncing function and cache it."""
         fn = self.no_op if self.fn is None or not self.should or self.rank_zero_only else self.fn
         # save the function as `_fn` as the meta are being re-created and the object references need to match.
-        # ignore typing, bad support for `partial`: mypy/issues/1484
-        self._fn: Callable = partial(fn, reduce_op=self.op, group=self.group)  # type: ignore[unused-ignore]
+        self._fn: Callable = partial(fn, reduce_op=self.op, group=self.group)
 
     @property
     def __call__(self) -> Any:

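As background, a minimal sketch of the caching pattern in `_generate_sync_fn` (names simplified, not the real class): the reduce settings are bound once with functools.partial and stored, so repeated calls reuse the same bound object instead of rebuilding it. Modern mypy types `partial` correctly, which is why the stale `# type: ignore` could be dropped.

from functools import partial
from typing import Any, Callable, Optional


def no_op(value: Any, reduce_op: Optional[str] = None, group: Any = None) -> Any:
    # stand-in for the distributed reduction function
    return value


class Sync:
    def __init__(self, op: str = "mean", group: Any = None) -> None:
        self.op, self.group = op, group
        # bind the settings once and cache the result as `_fn`
        self._fn: Callable = partial(no_op, reduce_op=self.op, group=self.group)


print(Sync()._fn(3.0))  # 3.0
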
src/lightning/pytorch/utilities/deepspeed.py

Lines changed: 2 additions & 2 deletions
@@ -93,10 +93,10 @@ def convert_zero_checkpoint_to_fp32_state_dict(
     ]
     checkpoint_dir = ds_checkpoint_dir(checkpoint_dir)
     optim_files = get_optim_files(checkpoint_dir)
-    optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE)
+    optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE, weights_only=False)
     zero_stage = optim_state["optimizer_state_dict"]["zero_stage"]
     model_file = get_model_state_file(checkpoint_dir, zero_stage)
-    client_state = torch.load(model_file, map_location=CPU_DEVICE)
+    client_state = torch.load(model_file, map_location=CPU_DEVICE, weights_only=False)
     client_state = {key: value for key, value in client_state.items() if key not in deepspeed_states}
     # State dict keys will include reference to wrapper _LightningModuleWrapperBase in old checkpoints created in
     # Lightning version < 2.1. Delete the `_forward_module` prefix before saving.

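The change above is needed because PyTorch 2.6 flipped the default of `torch.load` to `weights_only=True`, and DeepSpeed checkpoint files carry pickled non-tensor state that the restricted unpickler rejects. A sketch of the behavior with a hypothetical file name (the first load only raises on PyTorch >= 2.6; older versions default to full unpickling):

import torch

# a payload containing an arbitrary pickled object, as DeepSpeed states do
state = {"zero_stage": 3, "opaque": object()}
torch.save(state, "ckpt.pt")

try:
    torch.load("ckpt.pt")  # weights_only=True by default on >= 2.6
except Exception as err:
    print(f"rejected: {type(err).__name__}")

# explicit opt-in restores full unpickling; use only on trusted checkpoints
state = torch.load("ckpt.pt", map_location="cpu", weights_only=False)
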
0 commit comments