
Commit 4f08f96

Commit message: add comments for addressing feedback

1 parent 6505e36

File tree: 2 files changed (+7, -0 lines)


src/accelerate/accelerator.py (4 additions, 0 deletions)

@@ -1627,6 +1627,10 @@ def _get_tensor_address(p):
         for obj in result:
             if isinstance(obj, torch.optim.Optimizer):
                 for param_group in obj.param_groups:
+                    # Each param_group originally maps to model parameters (e.g., from model.parameters()).
+                    # After _prepare_tp(), parameter references are replaced with DTensor instances.
+                    # Therefore, we remap the parameter references to their new DTensor addresses
+                    # so that the optimizer can correctly update the model parameters.
                     param_group["params"] = [mapping[_get_tensor_address(p)] for p in param_group["params"]]
 
         return args
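
For context on why this remapping matters, here is a minimal standalone sketch. The toy Linear model, the SGD optimizer, and the data_ptr()-based _get_tensor_address are illustrative assumptions rather than Accelerate's actual implementation; the parameter swap below stands in for what _prepare_tp() does when it replaces a model's parameters with new tensor objects.

    import torch

    def _get_tensor_address(p):
        # Assumption for illustration: identify a tensor by its storage address.
        # The helper of the same name in accelerator.py may be implemented differently.
        return p.data_ptr()

    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # Simulate the parameter swap done by _prepare_tp(): build replacement
    # parameters and record a mapping from old addresses to the new objects.
    mapping = {}
    for name, old_param in list(model.named_parameters()):
        new_param = torch.nn.Parameter(old_param.detach().clone())
        mapping[_get_tensor_address(old_param)] = new_param
        setattr(model, name, new_param)

    # The optimizer still references the orphaned old parameters, so rewrite
    # each param_group to point at the replacements; otherwise optimizer.step()
    # would update tensors the model no longer uses.
    for param_group in optimizer.param_groups:
        param_group["params"] = [mapping[_get_tensor_address(p)] for p in param_group["params"]]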

src/accelerate/utils/fsdp_utils.py (3 additions, 0 deletions)

@@ -507,6 +507,9 @@ def _cast_and_contiguous(tensor, to_contiguous, dtype):
     device_mesh = sharded_param.device_mesh
     full_param = full_param.detach().to(device_mesh.device_type)
     if isinstance(full_param, DTensor):
+        # dist.broadcast() only supports torch.Tensor.
+        # After prepare_tp(), model parameters may become DTensor.
+        # To broadcast such a parameter, convert it to a local tensor first.
         full_param = full_param.to_local()
     dist.broadcast(full_param, src=0, group=dist.group.WORLD)
     sharded_tensor = distribute_tensor(full_param, device_mesh, sharded_param.placements)
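
For reference, the broadcast-then-reshard pattern these comments describe can be written as a self-contained helper. This is a sketch under assumptions, not Accelerate's exact code: it presumes an already-initialized process group, that sharded_param is a DTensor with a populated device_mesh, and the public torch.distributed.tensor import path (older PyTorch releases expose the same names under torch.distributed._tensor).

    import torch.distributed as dist
    from torch.distributed.tensor import DTensor, distribute_tensor

    def broadcast_then_shard(full_param, sharded_param):
        # Hypothetical helper mirroring the lines above; requires
        # dist.init_process_group() to have been called beforehand.
        device_mesh = sharded_param.device_mesh
        full_param = full_param.detach().to(device_mesh.device_type)
        if isinstance(full_param, DTensor):
            # dist.broadcast() only accepts a plain torch.Tensor, so unwrap the
            # DTensor into its local tensor before the collective.
            full_param = full_param.to_local()
        dist.broadcast(full_param, src=0, group=dist.group.WORLD)
        # Re-distribute the synchronized tensor with the original placements.
        return distribute_tensor(full_param, device_mesh, sharded_param.placements)

Unwrapping with to_local() keeps the collective operating on an ordinary tensor, and distribute_tensor() then re-applies the original placements so the result matches the sharded layout expected downstream.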
