
Commit f1ebb63

JKSenthil authored and facebook-github-bot committed
disable reshard_after_forward in top level module (#1009)
Summary: Pull Request resolved: #1009

Reviewed By: richardwang-at-fb

Differential Revision: D76364770

fbshipit-source-id: 0c5eb22617ca7d903b2db7b159b2ea31f76e7244
1 parent db8367b commit f1ebb63

File tree

2 files changed: +9 -2 lines


tests/utils/test_prepare_module.py

Lines changed: 1 addition & 1 deletion
@@ -293,7 +293,7 @@ def test_fsdp2_mesh(self, mock_fully_shard: Mock) -> None:
             global_mesh=mock_global_mesh,
         )
         mock_fully_shard.assert_called_with(
-            module, mesh=mock_mesh, reshard_after_forward=True
+            module, mesh=mock_mesh, reshard_after_forward=False
         )
 
     @patch("torchtnt.utils.prepare_module._prepare_module_2d")
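
For illustration, a minimal, self-contained sketch of the mock-assertion pattern the updated test relies on. The shard_root helper below is hypothetical; the real test patches torchtnt's fully_shard import and exercises prepare_fsdp2.

    from unittest.mock import MagicMock

    def shard_root(fully_shard, module, mesh):
        # Illustrative stand-in for the code path under test: the root module is
        # passed to fully_shard with reshard_after_forward forced to False.
        fully_shard(module, mesh=mesh, reshard_after_forward=False)

    mock_fully_shard = MagicMock()
    module, mesh = object(), object()
    shard_root(mock_fully_shard, module, mesh)
    # Same style of check as the updated assertion in the diff above.
    mock_fully_shard.assert_called_with(module, mesh=mesh, reshard_after_forward=False)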

torchtnt/utils/prepare_module.py

Lines changed: 8 additions & 1 deletion
@@ -7,6 +7,7 @@
 # pyre-strict
 
 import logging
+from copy import deepcopy
 from dataclasses import asdict, dataclass, field
 from functools import partial
 from typing import (
@@ -468,7 +469,13 @@ def prepare_fsdp2(
 
     # shard the top level model, so that all params are moved off cpu to gpu
     if not _is_fsdp2_module(module):
-        fully_shard(module, **fsdp_kwargs)
+        # disable reshard_after_forward for top level module
+        # as result is DTensor which may be incompatible with
+        # certain loss computation
+        root_kwargs = deepcopy(fsdp_kwargs)
+        root_kwargs["reshard_after_forward"] = False
+
+        fully_shard(module, **root_kwargs)
 
     # materialized sharded meta weights to device
     materialize_meta_params(module, device)
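
For context, a minimal sketch of the pattern this hunk implements, assuming a PyTorch build where fully_shard is importable from torch.distributed._composable.fsdp and accepts mesh / reshard_after_forward keyword arguments. The shard_model helper and the child-by-child loop are illustrative only, not torchtnt's prepare_fsdp2 logic.

    from copy import deepcopy

    from torch import nn
    from torch.distributed._composable.fsdp import fully_shard  # assumed import path


    def shard_model(module: nn.Module, **fsdp_kwargs) -> nn.Module:
        # Shard submodules with the caller-provided kwargs; resharding their
        # parameters after forward keeps memory low for the rest of the step.
        for child in module.children():
            fully_shard(child, **fsdp_kwargs)

        # For the root module, copy the kwargs and force reshard_after_forward=False.
        # Per the diff's comment, the root's result may otherwise be a DTensor that
        # some loss computations cannot consume.
        root_kwargs = deepcopy(fsdp_kwargs)
        root_kwargs["reshard_after_forward"] = False
        fully_shard(module, **root_kwargs)
        return module

Copying fsdp_kwargs before mutating it (as the diff does with deepcopy) keeps the override local to the root call, so submodules sharded earlier with the same kwargs are unaffected.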
