
Commit 7edf59c

[5704162] Create a copy to avoid leaking ProcessGroup into state dict (NVIDIA#640)
## What does this PR do?

**Type of change:** Bug Fix

**Overview:** Fixes a `TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object` error that occurs during checkpoint saving when using modelopt==0.40.0 with Megatron-LM. The `ensure_metadata_has_dp_cp_group()` function (introduced in NVIDIA#606) modified the metadata dict in-place by adding a ProcessGroup object. The ProcessGroup therefore leaked into the `common_state_dict`, which is broadcast via `torch.distributed.broadcast_object_list()` during checkpoint validation; since ProcessGroup objects cannot be pickled, the save fails. `ensure_metadata_has_dp_cp_group()` now creates a copy of the metadata dict instead of modifying it in-place.

## Usage

```python
# Add a code snippet demonstrating how to use this
```

## Testing

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Additional Information

---------

Signed-off-by: Fridah-nv <[email protected]>
1 parent c1c5ca0 commit 7edf59c
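
The failure mode is easy to reproduce outside Megatron-LM. The sketch below is not part of the commit: a `threading.Lock` stands in for the unpicklable ProcessGroup, and the dict keys are illustrative. Because `torch.distributed.broadcast_object_list()` pickles its payload, any such object left in the metadata dict fails the same way at save time.

```python
# Minimal, hypothetical sketch: why an unpicklable value in checkpoint metadata
# breaks the save. threading.Lock is a stand-in for a torch ProcessGroup,
# which likewise cannot be pickled.
import pickle
import threading

metadata = {"prefix": "decoder.layers.0.", "dp_cp_group": threading.Lock()}

try:
    # broadcast_object_list() pickles its payload under the hood, so anything
    # unpicklable in the dict surfaces as a TypeError at save time.
    pickle.dumps(metadata)
except TypeError as err:
    print(f"Pickling failed: {err}")  # e.g. "cannot pickle '_thread.lock' object"
```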

File tree

3 files changed: +28 −50 lines


modelopt/torch/opt/plugins/megatron.py

Lines changed: 26 additions & 0 deletions
```diff
@@ -22,10 +22,36 @@
 import megatron.core.transformer.mlp as megatron_mlp
 import regex as re
 import torch
+from megatron.core.parallel_state import get_data_parallel_group

 from ..dynamic import DynamicModule


+def ensure_metadata_has_dp_cp_group(metadata):
+    """Ensure `metadata` is a dict containing `dp_cp_group` entry.
+
+    This function is adapted from megatron-lm's megatron.core.transformer.utils to avoid
+    dependency on megatron-lm's specific version.
+
+    Note:
+        This is a temporary method and will be removed once this function is merged to
+        megatron.core.transformer.utils in the main branch of megatron-lm.
+    """
+    # Create a copy to avoid modifying the original metadata dict
+    # This prevents ProcessGroup from leaking into state dict
+    if metadata is None:
+        new_metadata = {}
+    else:
+        new_metadata = dict(metadata)
+    if "dp_cp_group" not in new_metadata:
+        try:
+            new_metadata["dp_cp_group"] = get_data_parallel_group(with_context_parallel=True)
+        except (AssertionError, RuntimeError):
+            # Fallback if context parallel is not initialized
+            new_metadata["dp_cp_group"] = get_data_parallel_group()
+    return new_metadata
+
+
 def _modelopt_get_extra_state(self):
     """Populating the extra_state when state_dict() is called.

```
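
The essential behavioral change above is that the helper now enriches a shallow copy rather than the caller's dict. For reference, a self-contained sketch of that copy-before-mutate pattern; the helper name `add_dp_cp_group`, the `sharded_offsets` key, and the placeholder string are illustrative only, while the real function obtains the group from `get_data_parallel_group()`.

```python
# Hypothetical stand-alone sketch of the copy-before-mutate pattern used by
# ensure_metadata_has_dp_cp_group(); the string below is a placeholder for the
# real process group, so the example runs without torch.distributed.
def add_dp_cp_group(metadata):
    new_metadata = {} if metadata is None else dict(metadata)  # shallow copy
    new_metadata.setdefault("dp_cp_group", "<dp_cp process group>")
    return new_metadata


caller_metadata = {"sharded_offsets": ()}
enriched = add_dp_cp_group(caller_metadata)

assert "dp_cp_group" in enriched             # the copy carries the group
assert "dp_cp_group" not in caller_metadata  # the caller's dict stays picklable
```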

modelopt/torch/quantization/plugins/megatron.py

Lines changed: 1 addition & 24 deletions
```diff
@@ -33,6 +33,7 @@

 from modelopt.torch.opt.plugins.megatron import (
     _MegatronMLP,
+    ensure_metadata_has_dp_cp_group,
     register_modelopt_extra_state_callbacks,
 )
 from modelopt.torch.utils.distributed import ParallelState
@@ -230,30 +231,6 @@ def _register_extra_state_callbacks(model: torch.nn.Module):
 CUSTOM_MODEL_PLUGINS.add(megatron_replace_quant_module_hook)


-def ensure_metadata_has_dp_cp_group(metadata):
-    """Ensure `metadata` is a dict containing `dp_cp_group` entry.
-
-    If `metadata` is None, a new dict is returned with `dp_cp_group` set.
-    If `metadata` is a dict and missing `dp_cp_group`, it is updated in-place.
-
-    This function is adapted from megatron-lm's megatron.core.transformer.utils to avoid
-    dependency on megatron-lm's specific version.
-
-    Note:
-        This is a temporary method and will be removed once this function is merged to
-        megatron.core.transformer.utils in the main branch of megatron-lm.
-    """
-    if metadata is None:
-        metadata = {}
-    if "dp_cp_group" not in metadata:
-        try:
-            metadata["dp_cp_group"] = get_data_parallel_group(with_context_parallel=True)
-        except (AssertionError, RuntimeError):
-            # Fallback if context parallel is not initialized
-            metadata["dp_cp_group"] = get_data_parallel_group()
-    return metadata
-
-
 class _MegatronParallelLinear(_ParallelLinear):
     _functionals_to_replace = [
         (megatron_parallel, "linear_with_grad_accumulation_and_async_allreduce"),
```

modelopt/torch/sparsity/weight_sparsity/plugins/megatron.py

Lines changed: 1 addition & 26 deletions
```diff
@@ -16,40 +16,15 @@
 """Support sparsify and save/resore for Megatron."""

 import megatron.core.transformer.mlp as megatron_mlp
-from megatron.core.parallel_state import get_data_parallel_group
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint

-from modelopt.torch.opt.plugins.megatron import _MegatronMLP
+from modelopt.torch.opt.plugins.megatron import _MegatronMLP, ensure_metadata_has_dp_cp_group

 from ..config import SparseGPTConfig, SparseMagnitudeConfig
 from ..module import SparseModule, SpDMRegistry


-def ensure_metadata_has_dp_cp_group(metadata):
-    """Ensure `metadata` is a dict containing `dp_cp_group` entry.
-
-    If `metadata` is None, a new dict is returned with `dp_cp_group` set.
-    If `metadata` is a dict and missing `dp_cp_group`, it is updated in-place.
-
-    This function is adapted from megatron-lm's megatron.core.transformer.utils to avoid
-    dependency on megatron-lm's specific version.
-
-    Note:
-        This is a temporary method and will be removed once this function is merged to
-        megatron.core.transformer.utils in the main branch of megatron-lm.
-    """
-    if metadata is None:
-        metadata = {}
-    if "dp_cp_group" not in metadata:
-        try:
-            metadata["dp_cp_group"] = get_data_parallel_group(with_context_parallel=True)
-        except (AssertionError, RuntimeError):
-            # Fallback if context parallel is not initialized
-            metadata["dp_cp_group"] = get_data_parallel_group()
-    return metadata
-
-
 class _MegatronParallelLinear(SparseModule):
     def _get_shard_axis_dict(self, state_dict):
         raise NotImplementedError
```
