Merge branch 'master' into docs/fp16-precision-warning

bhimrazy · web-flow · commit 276d037a045a · 2025-10-20T13:28:49.000+05:45
diff --git a/requirements/doctests.txt b/requirements/doctests.txt
@@ -1,2 +1,2 @@
 pytest ==8.4.2
-pytest-doctestplus ==1.4.0
+pytest-doctestplus ==1.5.0
diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt
@@ -5,5 +5,5 @@
 
 # note: there is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 #  shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"
diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt
@@ -5,7 +5,7 @@
 matplotlib>3.1, <3.11.0
 omegaconf >=2.2.3, <2.4.0
 hydra-core >=1.2.0, <1.4.0
-jsonargparse[signatures,jsonnet] >=4.39.0, <4.42.0
+jsonargparse[signatures,jsonnet] >=4.39.0, <4.43.0
 rich >=12.3.0, <14.2.0
 tensorboardX >=2.2, <2.7.0  # min version is set by torch.onnx missing attribute
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
@@ -3,4 +3,4 @@
 
 # note: there is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 #  shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
@@ -37,6 +37,7 @@
 from lightning.fabric.strategies.registry import _StrategyRegistry
 from lightning.fabric.strategies.strategy import _Sharded
 from lightning.fabric.utilities.distributed import log
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.load import _move_state_into
 from lightning.fabric.utilities.rank_zero import rank_zero_info, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed
@@ -47,6 +48,7 @@
     from torch.optim.lr_scheduler import _LRScheduler
 
 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
+_DEEPSPEED_GREATER_EQUAL_0_16 = RequirementCache("deepspeed>=0.16.0")
 
 
 # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -239,6 +241,19 @@ def __init__(
                 " Install it by running `pip install -U deepspeed`."
             )
 
+        if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
+            # Starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` when loading full checkpoints.
+            # DeepSpeed added support for this behavior in version 0.16.0.
+            import deepspeed
+
+            deepspeed_version = deepspeed.__version__
+
+            raise ImportError(
+                f"PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
+                f"Detected DeepSpeed version: {deepspeed_version}. "
+                "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
+            )
+
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,
diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py
@@ -36,5 +36,6 @@
 _TORCH_GREATER_EQUAL_2_4_1 = compare_version("torch", operator.ge, "2.4.1")
 _TORCH_GREATER_EQUAL_2_5 = compare_version("torch", operator.ge, "2.5.0")
 _TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0")
+_TORCH_GREATER_EQUAL_2_6 = compare_version("torch", operator.ge, "2.6.0")
 _TORCHMETRICS_GREATER_EQUAL_1_0_0 = compare_version("torchmetrics", operator.ge, "1.0.0")
 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed preventing recursive symlink creation when `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))
 
 
+- Fixed `ModelPruning` sparsity logging bug that caused incorrect sparsity percentages ([#21223](https://github.com/Lightning-AI/pytorch-lightning/pull/21223))
+
+
 - Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))
 
 
diff --git a/src/lightning/pytorch/callbacks/pruning.py b/src/lightning/pytorch/callbacks/pruning.py
@@ -349,7 +349,7 @@ def apply_pruning(self, amount: Union[int, float]) -> None:
     def _log_sparsity_stats(
         self, prev: list[tuple[int, int]], curr: list[tuple[int, int]], amount: Union[int, float] = 0
     ) -> None:
-        total_params = sum(p.numel() for layer, _ in self._parameters_to_prune for p in layer.parameters())
+        total_params = sum(total for _, total in curr)
         prev_total_zeros = sum(zeros for zeros, _ in prev)
         curr_total_zeros = sum(zeros for zeros, _ in curr)
         log.info(
diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py
@@ -35,10 +35,12 @@
 from lightning.fabric.strategies import _StrategyRegistry
 from lightning.fabric.strategies.deepspeed import (
     _DEEPSPEED_AVAILABLE,
+    _DEEPSPEED_GREATER_EQUAL_0_16,
     _format_precision_config,
     _validate_checkpoint_directory,
     _validate_device_index_selection,
 )
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import _PATH
@@ -262,6 +264,19 @@ def __init__(
                 " Install it by running `pip install -U deepspeed`."
             )
 
+        if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
+            # Starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` when loading full checkpoints.
+            # DeepSpeed added support for this behavior in version 0.16.0.
+            import deepspeed
+
+            deepspeed_version = deepspeed.__version__
+
+            raise ImportError(
+                f"PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
+                f"Detected DeepSpeed version: {deepspeed_version}. "
+                "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
+            )
+
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,
diff --git a/src/lightning/pytorch/utilities/deepspeed.py b/src/lightning/pytorch/utilities/deepspeed.py
@@ -93,10 +93,10 @@ def convert_zero_checkpoint_to_fp32_state_dict(
     ]
     checkpoint_dir = ds_checkpoint_dir(checkpoint_dir)
     optim_files = get_optim_files(checkpoint_dir)
-    optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE)
+    optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE, weights_only=False)
     zero_stage = optim_state["optimizer_state_dict"]["zero_stage"]
     model_file = get_model_state_file(checkpoint_dir, zero_stage)
-    client_state = torch.load(model_file, map_location=CPU_DEVICE)
+    client_state = torch.load(model_file, map_location=CPU_DEVICE, weights_only=False)
     client_state = {key: value for key, value in client_state.items() if key not in deepspeed_states}
     # State dict keys will include reference to wrapper _LightningModuleWrapperBase in old checkpoints created in
     # Lightning version < 2.1. Delete the `_forward_module` prefix before saving.
diff --git a/tests/tests_pytorch/callbacks/test_pruning.py b/tests/tests_pytorch/callbacks/test_pruning.py
@@ -262,13 +262,13 @@ def test_multiple_pruning_callbacks(tmp_path, caplog, make_pruning_permanent: bo
     actual = [m for m in actual if m.startswith("Applied")]
     percentage = r"\(\d+(?:\.\d+)?%\)"
     expected = [
-        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}",
+        rf"Applied `L1Unstructured`. Pruned: \d+\/1088 {percentage} -> \d+\/1088 {percentage}",
         rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ {percentage}",  # noqa: E501
         rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ {percentage}",  # noqa: E501
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/1088 {percentage} -> \d+\/1088 {percentage}",
         rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
         rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
-        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}",
+        rf"Applied `L1Unstructured`. Pruned: \d+\/1088 {percentage} -> \d+\/1088 {percentage}",
         rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
         rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
     ]
@@ -329,9 +329,9 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint):
     actual = [m for m in actual if m.startswith("Applied")]
     percentage = r"\(\d+(?:\.\d+)?%\)"
     expected = [
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}",
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}",
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/64 {percentage} -> \d+\/64 {percentage}",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/64 {percentage} -> \d+\/64 {percentage}",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/64 {percentage} -> \d+\/64 {percentage}",
     ]
     expected = [re.compile(s) for s in expected]
     assert all(regex.match(s) for s, regex in zip(actual, expected))
@@ -463,3 +463,91 @@ def __init__(self):
         f"Actual weight_orig: {weight_orig}\n"
         f"Max difference: {torch.max(torch.abs(weight_orig - original_weights))}"
     )
+
+
+@pytest.mark.parametrize("pruning_amount", [0.1, 0.2, 0.3, 0.5])
+@pytest.mark.parametrize("model_type", ["simple", "complex"])
+def test_sparsity_calculation(tmp_path, caplog, pruning_amount: float, model_type: str):
+    """Test that the sparsity calculation fix correctly reports percentages."""
+
+    class SimpleModel(BoringModel):
+        """Simple model with 66 parameters (64 weight + 2 bias)."""
+
+        def __init__(self):
+            super().__init__()
+            self.layer = nn.Linear(32, 2)  # 32*2 + 2 = 66 params
+
+    class ComplexModel(BoringModel):
+        """Complex model with multiple layers."""
+
+        def __init__(self):
+            super().__init__()
+            self.layer1 = nn.Linear(32, 64)  # 32*64 + 64 = 2112 params
+            self.layer2 = nn.Linear(64, 2)  # 64*2 + 2 = 130 params
+            # Total: 2112 + 130 = 2242 params (but only layer1 will be pruned)
+            # layer1 params: 2112
+
+        def forward(self, x):
+            x = torch.relu(self.layer1(x))
+            return self.layer2(x)
+
+    if model_type == "simple":
+        model = SimpleModel()
+        expected_total_params = 66
+        parameters_to_prune = None
+    else:
+        model = ComplexModel()
+        expected_total_params = 2112
+        parameters_to_prune = [(model.layer1, "weight"), (model.layer1, "bias")]
+
+    pruning = ModelPruning(
+        pruning_fn="l1_unstructured",
+        parameters_to_prune=parameters_to_prune,
+        amount=pruning_amount,
+        verbose=1,
+        use_global_unstructured=True,
+    )
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+        enable_checkpointing=False,
+        logger=False,
+        limit_train_batches=1,
+        max_epochs=1,
+        accelerator="cpu",
+        callbacks=[pruning],
+    )
+
+    with caplog.at_level(INFO):
+        trainer.fit(model)
+
+    sparsity_logs = [msg for msg in caplog.messages if "Applied `L1Unstructured`. Pruned:" in msg]
+    assert len(sparsity_logs) == 1, f"Expected 1 sparsity log, got {len(sparsity_logs)}"
+    sparsity_log = sparsity_logs[0]
+    pattern = r"Applied `L1Unstructured`\. Pruned: \d+/(\d+) \(\d+\.\d+%\) -> (\d+)/(\d+) \((\d+\.\d+)%\)"
+    match = re.search(pattern, sparsity_log)
+    assert match, f"Could not parse sparsity log: {sparsity_log}"
+
+    total_params_before = int(match.group(1))
+    pruned_count = int(match.group(2))
+    total_params_after = int(match.group(3))
+    sparsity_percentage = float(match.group(4))
+    assert total_params_before == expected_total_params, (
+        f"Total parameter count mismatch for {model_type} model. "
+        f"Expected {expected_total_params}, got {total_params_before}"
+    )
+    assert total_params_after == expected_total_params, (
+        f"Total parameter count should be consistent. Before: {total_params_before}, After: {total_params_after}"
+    )
+
+    # Verify sparsity percentage is approximately correct
+    expected_sparsity = pruning_amount * 100
+    tolerance = 5.0
+    assert abs(sparsity_percentage - expected_sparsity) <= tolerance
+
+    # Verify the number of pruned parameters is reasonable
+    expected_pruned_count = int(expected_total_params * pruning_amount)
+    pruned_tolerance = max(1, int(expected_total_params * 0.05))
+    assert abs(pruned_count - expected_pruned_count) <= pruned_tolerance

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,2 +1,2 @@` |
| `1` | `1` | `pytest ==8.4.2` |
| `2` |  | `-pytest-doctestplus ==1.4.0` |
|  | `2` | `+pytest-doctestplus ==1.5.0` |