
Commit 1014220

Cherry-pick bug-fixes into 0.15.X.
Signed-off-by: Cory Ye <cye@nvidia.com>
1 parent 1221b91 commit 1014220

File tree

7 files changed: +476 additions, -284 deletions


megatron/core/distributed/fsdp/src/README.md

Lines changed: 99 additions & 32 deletions
Large diffs are not rendered by default.

megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py

Lines changed: 220 additions & 166 deletions
Large diffs are not rendered by default.

megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py

Lines changed: 6 additions & 0 deletions
@@ -283,8 +283,14 @@ def __init__(
         self._register_fsdp_hooks(self.module)
         self.microbatch_count = 0

+        # Add a reference from the distributed parameters to self for API
+        # accessibility, e.g. when attaching MegatronFSDP scheduled ops
+        # to the distributed optimizer.step() and optimizer.zero_grad().
         self.is_param_fsdp_distributed = False
         self._replace_param_with_distributed_if_needed()
+        for param in self.module.parameters():
+            # Attach MegatronFSDP reference to the parameter.
+            setattr(param, "_megatron_fsdp_model", self)

     def _check_module_parameter_types(self):
         """

megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 3 additions & 18 deletions
@@ -31,7 +31,6 @@
 import torch
 from torch.distributed import _coalescing_manager
 from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.distributed.tensor.device_mesh import _mesh_resources

 from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor
 from .utils import _MODEL_PARALLEL_RNG_TRACKER_NAME, FSDPDistributedIndex, get_global_memory_buffer
@@ -94,7 +93,7 @@ def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
     message ``s`` since otherwise, it is swallowed.
     """
     if not cond:
-        print(s)
+        logger.warning(s)
         traceback.print_stack()
         if raise_assertion_error:
             raise AssertionError(s)
@@ -205,7 +204,7 @@ def __exit__(self, *args):
         for group in self.groups[1:]:
             backend = group._get_backend(torch.device("cuda", torch.cuda.current_device()))
             if torch.distributed.get_rank() == 0:
-                print(
+                logger.info(
                     f"[MultiGroupUBRAllocator] Registering mem pool to group {group}, "
                     f"group.group_desc:{group.group_desc}"
                 )
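Because the two hunks above route messages through the module-level logger instead of print, they are only visible if the application configures logging. A minimal sketch of such a setup (an assumption, not part of this commit):

import logging

# INFO level so that the rank-0 [MultiGroupUBRAllocator] registration messages
# are emitted; warnings from _p_assert are shown at any level up to WARNING.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)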
@@ -3525,20 +3524,6 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa
     if isinstance(param, DTensor) and cast(DTensor, param)._spec.num_shards > 1:
         # Retrieve original DTensorSpec (for TP).
         dtensor_spec = cast(DTensor, param)._spec
-        dtensor_mesh = getattr(dtensor_spec, "mesh", None)
-
-        # Validate that the DTensor root mesh is identical to the Megatron-FSDP device mesh.
-        megatron_fsdp_global_mesh = dist_index.get_root_mesh()
-        dtensor_global_mesh = _mesh_resources.get_root_mesh(dtensor_mesh)
-        # FIXME(boxiangw): add or megatron_fsdp_global_mesh != dtensor_global_mesh:
-        # _mesh_resources.get_root_mesh(dtensor_mesh) is not getting the correct root mesh
-        if dtensor_global_mesh is None:
-            raise ValueError(
-                f"When utilizing DTensor-based modules with Megatron-FSDP, the DTensor root "
-                f"device mesh must be identical to the Megatron-FSDP root device mesh.\n"
-                f"DTensor Root Mesh: {dtensor_global_mesh} / Megatron-FSDP "
-                f"Root Mesh: {megatron_fsdp_global_mesh}"
-            )

         # Get the placements for the parameter.
         assert len(dtensor_spec.placements) == 1, (
@@ -3724,7 +3709,7 @@ def make_fsdp_dtensor(
             device_mesh=tp_mesh,
             placements=[Shard(tp_dim)],
             run_check=run_check,
-            shape=global_shape,
+            shape=tuple(global_shape),
             stride=torch.empty(global_shape).stride(),
         )

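The last hunk normalizes global_shape to a plain tuple before constructing the DTensor. A sketch of the same call pattern, assuming torch.distributed is initialized, one CUDA device per rank, and a 1-D tensor-parallel mesh; names such as tp_mesh, local_shard, and the shapes are illustrative, not taken from this commit:

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard

# Assumes torch.distributed.init_process_group() has already been called.
tp_mesh = init_device_mesh("cuda", (torch.distributed.get_world_size(),), mesh_dim_names=("tp",))
local_shard = torch.randn(128, 4096, device="cuda")
global_shape = torch.Size([128 * tp_mesh.size(), 4096])
param = DTensor.from_local(
    local_shard,
    device_mesh=tp_mesh,
    placements=[Shard(0)],
    run_check=False,
    # Passing a plain tuple (rather than a torch.Size or list) mirrors the fix above.
    shape=tuple(global_shape),
    stride=torch.empty(global_shape).stride(),
)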

megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py

Lines changed: 21 additions & 1 deletion
@@ -25,6 +25,8 @@
 from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
 from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard

+from .utils import get_mesh_names
+

 def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageMetadata:
     """
@@ -272,7 +274,25 @@ def gather_uneven_dtensor_to_full_tensor(
     if not device_mesh.mesh_dim_names:
         process_group = device_mesh.get_group()
     else:
-        process_group = device_mesh._flatten().get_group()
+        # Check if the fully-flattened mesh exists first.
+        full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
+        if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
+            # Retrieve the existing flattened DeviceMesh ProcessGroup.
+            try:
+                # Two Cases: Name is a root dimension, or using the old DeviceMesh
+                # API which allows us to get flattened dimensions.
+                process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
+            except:
+                # Name is a flattened dimension that cannot be retrieved from the
+                # DeviceMesh.__getitem__, so fall-back to new DeviceMesh API.
+                process_group = (
+                    device_mesh._get_root_mesh()
+                    ._flatten_mapping[full_flattened_mesh_dim_name]
+                    .get_group()
+                )
+        else:
+            # Create the _-separated flattened DeviceMesh ProcessGroup.
+            process_group = device_mesh._flatten().get_group()

     # Collect chunk metadata for uneven shards (update if missing)
     if not hasattr(dtensor._local_tensor, "__create_chunk_list__"):
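A small sketch of the lookup order the new branch implements, assuming torch.distributed is initialized with 4 ranks arranged as a 2x2 ("dp", "cp") mesh; note that _flatten, _get_root_mesh, and _flatten_mapping are private DeviceMesh APIs whose availability depends on the PyTorch version:

import torch
from torch.distributed.device_mesh import init_device_mesh

# Assumes torch.distributed.init_process_group() has been called with 4 ranks.
mesh = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "cp"))
flat_name = "_".join(mesh.mesh_dim_names)  # "dp_cp"

# Creating the flattened mesh once caches it; the gather above prefers the
# cached ProcessGroup and only calls _flatten() when no such mesh exists yet.
dp_cp_group = mesh._flatten().get_group()

# On newer PyTorch, the cached flattened mesh is also discoverable on the root mesh.
if hasattr(mesh, "_get_root_mesh"):
    assert flat_name in mesh._get_root_mesh()._flatten_mapping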

megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py

Lines changed: 48 additions & 28 deletions
@@ -34,7 +34,6 @@
 from torch.cuda import _lazy_call, _lazy_init
 from torch.cuda import device as device_ctx_manager
 from torch.distributed import DeviceMesh, ProcessGroup
-from torch.distributed.device_mesh import _mesh_resources

 logger = logging.getLogger(__name__)

@@ -150,30 +149,50 @@ def is_float8tensor(tensor: torch.Tensor) -> bool:
     return HAVE_TE_FP8_TENSOR_CLASS and isinstance(tensor, FP8_TENSOR_CLASS)


-def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
+def get_mesh_names(
+    device_mesh: Optional[DeviceMesh] = None, only_submesh_dims: bool = False
+) -> list[str]:
     """
-    Get all the sub-mesh names in the DeviceMesh.
+    Get all the sub-mesh ("dp", "cp", etc.) and flattened-mesh ("dp_cp", etc.) names
+    in the DeviceMesh. When only_submesh_dims=True, only checks for sub-mesh dimensions.
     """
     if device_mesh is None:
         # Device mesh does not exist.
         return []
-    # Order of the returned list of mesh dimension names must match the order / index
-    # of the root mesh dimension names followed by children / flattened sub-meshes:
-    # [<root mesh dimension names>, <child mesh dimension names>]
-    mesh_dim_names = (
+
+    # Sub-mesh dimension names.
+    submesh_dim_names = (
         list(device_mesh.mesh_dim_names) if device_mesh.mesh_dim_names is not None else []
     )
-    submesh_dim_names = [
-        submesh_dim_name
-        for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
-        for submesh_dim_name in (child_mesh.mesh_dim_names or [])
-        if root_mesh == device_mesh
-    ]
-    # Combine without duplicate dimensions.
-    for dim_name in submesh_dim_names:
-        if dim_name not in mesh_dim_names:
-            mesh_dim_names.append(dim_name)
-    return mesh_dim_names
+
+    # Flattened mesh dimension names.
+    try:
+        # Retrieve all flattened meshes associated with DeviceMesh.
+        # The flattened DeviceMesh are all located in the _flatten_mapping
+        # dictionary of the root DeviceMesh.
+        flatten_mesh_names = [
+            flat_dim
+            for flat_dim, flat_mesh in device_mesh._get_root_mesh()._flatten_mapping.items()
+        ]
+    except AttributeError:
+        # Fallback to the DeviceMesh global state to retrieve flattened
+        # meshes associated with the DeviceMesh.
+        from torch.distributed.device_mesh import _mesh_resources
+
+        flatten_mesh_names = [
+            child_mesh_dim_name
+            for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
+            for child_mesh_dim_name in (child_mesh.mesh_dim_names or [])
+            if root_mesh == device_mesh and child_mesh_dim_name not in submesh_dim_names
+        ]
+
+    # Order of the returned list of mesh dimension names must match the index
+    # of the root mesh dimension names followed by flattened sub-meshes:
+    # [<root mesh dimension names>, <flattened mesh dimension names>]
+    if only_submesh_dims:
+        return submesh_dim_names
+    else:
+        return submesh_dim_names + flatten_mesh_names


 def contains_submesh(
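Hypothetical usage of the updated helper, assuming 4 ranks on a 2x2 mesh and that the module is importable as megatron_fsdp.utils (the import path may differ depending on how the package is installed):

from torch.distributed.device_mesh import init_device_mesh

from megatron_fsdp.utils import get_mesh_names

mesh = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "tp"))
print(get_mesh_names(mesh))                          # ["dp", "tp"] (no flattened meshes yet)
mesh._flatten()                                      # creates and caches the "dp_tp" flattened mesh
print(get_mesh_names(mesh))                          # ["dp", "tp", "dp_tp"]
print(get_mesh_names(mesh, only_submesh_dims=True))  # ["dp", "tp"] only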
@@ -720,16 +739,14 @@ def __init__(
         self.hybrid_fsdp_group = hybrid_fsdp_group

         """
-        Store a persistent reference to the core device meshes that back Megatron-FSDP.
-        This is necessary because _MeshEnv (_mesh_resources) may not persist:
-        - _mesh_resources.child_to_root_mapping
-        - _mesh_resources.root_to_flatten_mapping
-        - _mesh_resources.flatten_name_to_root_dims
-        - ...
-        during Torch Autograd, so child and flattened sub-meshes may be cleared.
-        For example, this breaks Megatron-FSDP when self.dp_shard_dim is the flattened
-        sub-mesh of the DP and CP root mesh dimensions.
-        FIXME(@cspades): Identify the root cause of this behavior.
+        Megatron-FSDP is responsible for storing all required DeviceMesh
+        as per best practices recommended by the DeviceMesh API.
+
+        NOTE(@cspades): In PyTorch 2.11, retrieving flattened mesh dimensions
+        will be impossible via the device_mesh[...] API. We will require all
+        users to correctly _unflatten() their DeviceMesh such that all
+        dimensions used by Megatron-FSDP are sub-meshes of the DeviceMesh.
+        contains_submesh(...) -> get_mesh_names(only_submesh_dims=True).
         """
         self.mesh_library = {}
         # TP Mesh
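An illustrative sketch (not the actual FSDPDistributedIndex code) of the practice the updated note above describes: keep persistent references to every sub-mesh Megatron-FSDP needs instead of re-deriving them from DeviceMesh global state later:

from torch.distributed.device_mesh import init_device_mesh

# Assumes 8 ranks; every dimension Megatron-FSDP will ask for is declared
# up front as a named sub-mesh of the root DeviceMesh.
mesh = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("dp", "cp", "tp"))
mesh_library = {name: mesh[name] for name in mesh.mesh_dim_names}
tp_mesh = mesh_library["tp"]  # persistent 1-D sub-mesh reference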
@@ -825,6 +842,9 @@ def get_outer_fsdp_group(self) -> ProcessGroup:

     def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh:
         """Get the device mesh."""
+        # NOTE(@cspades): This is FSDPDistributedIndex's root mesh, NOT the actual
+        # root mesh that the DeviceMesh or expert DeviceMesh was un-flattened from.
+        # To get the root mesh, use: DeviceMesh._get_root_mesh().
         if is_expert_parallel:
             raise NotImplementedError("Expert parallel is not supported in Megatron-FSDP.")
         return self.device_mesh
