
Commit c7b6c98

wanchaol authored and pytorchmergebot committed
[tp] improve parallelize_module API to support more cases (pytorch#157182)
This PR improves the parallelize_module API to support more corner cases:

1. If a plan entry is specified as "", the style is applied to the current module itself.
2. If a plan entry has no corresponding submodule to apply to, a warning is raised and the entry is ignored.

While working on this PR, I also found that the inner while-loop was not actually necessary and could produce some nasty modify-while-iterating behavior, so I removed it.

Pull Request resolved: pytorch#157182
Approved by: https://github.com/tianyu-l
1 parent d5e6f42 commit c7b6c98
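
To illustrate the two new behaviors, here is a rough usage sketch. It is not taken from this PR: ToyMLP, the mesh setup, and the layouts are made-up stand-ins that mirror the new tests below, and it assumes a default process group has already been initialized (e.g. via torchrun).

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    PrepareModuleInputOutput,
    RowwiseParallel,
    parallelize_module,
)


class ToyMLP(nn.Module):
    """Made-up stand-in for the MLPModule used in the tests."""

    def __init__(self):
        super().__init__()
        self.net1 = nn.Linear(10, 16)
        self.net2 = nn.Linear(16, 10)

    def forward(self, x):
        return self.net2(torch.relu(self.net1(x)))


mesh = init_device_mesh("cuda", (dist.get_world_size(),))
model = parallelize_module(
    ToyMLP().cuda(),
    mesh,
    {
        # 1. An "" entry now applies the style to the module itself.
        "": PrepareModuleInputOutput(
            input_layouts=Replicate(),
            desired_input_layouts=Shard(0),
            output_layouts=Shard(0),
            desired_output_layouts=Replicate(),
        ),
        "net1": ColwiseParallel(input_layouts=Shard(0)),
        "net2": RowwiseParallel(output_layouts=Shard(0)),
        # 2. Nothing matches this key, so it now emits a UserWarning
        #    and is skipped.
        "net0.hello.world": ColwiseParallel(),
    },
)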

File tree: 3 files changed (+92, -34 lines)


test/distributed/tensor/debug/test_comm_mode_features.py

Lines changed: 4 additions & 4 deletions
@@ -144,10 +144,10 @@ def test_MLPStacked_distributed_sharding_display(self):
         model2 = MLPStacked(self.device_type)
 
         parallelize_plan = {
-            "MLPStacked.layers.0.net1": ColwiseParallel(),
-            "MLPStacked.layers.0.net2": RowwiseParallel(),
-            "MLPStacked.layers.1.net1": ColwiseParallel(),
-            "MLPStacked.layers.1.net2": RowwiseParallel(),
+            "layers.0.net1": ColwiseParallel(),
+            "layers.0.net2": RowwiseParallel(),
+            "layers.1.net1": ColwiseParallel(),
+            "layers.1.net2": RowwiseParallel(),
         }
 
         model2 = parallelize_module(model2, device_mesh, parallelize_plan)
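
The plan keys change here because parallelize_module resolves each key against module.named_children() of the module it is given, so the first path token must be a direct child name; the class name "MLPStacked" is not a child, and with the no-match warning added in this PR such keys would now warn and be skipped. A quick way to see that (MLPStackedLike is a made-up stand-in whose only direct child is `layers`):

import torch.nn as nn


class MLPStackedLike(nn.Module):
    """Made-up stand-in with the same top-level child as the test's MLPStacked."""

    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8)) for _ in range(2)]
        )


# The only direct child is "layers", so a key starting with "MLPStacked"
# can never match and would now trigger the no-match warning.
print([name for name, _ in MLPStackedLike().named_children()])  # ['layers']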

test/distributed/tensor/parallel/test_parallelize_api.py

Lines changed: 45 additions & 1 deletion
@@ -332,6 +332,49 @@ def test_parallelize_module_multi_wildcard(self):
         )
         self._compare_module(model, model_tp, inp_size, rank0_only=False)
 
+    @with_comms
+    def test_parallelize_module_with_root_module(self):
+        inp_size = [16, 10]
+        model = MLPModule(self.device_type)
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        model_tp = deepcopy(model)
+        model_tp = parallelize_module(
+            model_tp,
+            device_mesh,
+            {
+                "": PrepareModuleInputOutput(
+                    input_layouts=Replicate(),
+                    desired_input_layouts=Shard(0),
+                    output_layouts=Shard(0),
+                    desired_output_layouts=Replicate(),
+                ),
+                "net1": ColwiseParallel(input_layouts=Shard(0)),
+                "net2": RowwiseParallel(output_layouts=Shard(0)),
+            },
+        )
+        self._compare_module(model, model_tp, inp_size, rank0_only=False)
+
+    @with_comms
+    def test_parallelize_module_with_no_match(self):
+        inp_size = [16, 10]
+        model = MLPModule(self.device_type)
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        model_tp = deepcopy(model)
+        with self.assertWarns(UserWarning):
+            model_tp = parallelize_module(
+                model_tp,
+                device_mesh,
+                {
+                    "net0.hello.world": ColwiseParallel(),
+                    "net1": ColwiseParallel(),
+                    "net2": RowwiseParallel(),
+                    "net3": ColwiseParallel(),
+                },
+            )
+        self._compare_module(model, model_tp, inp_size, rank0_only=False)
+
     @with_comms
     def test_under_devicemesh_context(self):
         # test ColwiseParallel
@@ -357,7 +400,8 @@ def test_empty_plan(self):
         # Call parallelize_module with empty plan.
         # Goal is not to crash.
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        parallelize_module(model, device_mesh)
+        with self.assertWarns(UserWarning):
+            parallelize_module(model, device_mesh)
 
 
 if __name__ == "__main__":

torch/distributed/tensor/parallel/api.py

Lines changed: 43 additions & 29 deletions
@@ -88,39 +88,53 @@ def parallelize_module(  # type: ignore[return]
         return parallelize_plan._apply(module, device_mesh)
     elif isinstance(parallelize_plan, dict):
         for module_path, parallelize_style in parallelize_plan.items():
+            if module_path == "":
+                # shortcut: empty string means to apply the plan to the current module
+                parallelize_module(module, device_mesh, parallelize_style)
+                continue
+
             path_splits = module_path.split(".")
-            if len(path_splits) == 0:
-                raise ValueError(
-                    "Expect module path to be non-empty, but got empty string!"
-                )
-            while path_splits:
-                atom = path_splits.pop(0)
-                matched_children = filter(
+            # Instead of blindly popping tokens, first check the match,
+            # we only consume/pop the token if we found a match.
+            token = path_splits[0]
+
+            matched_children = list(
+                filter(
                     # `t[0]` is child name
-                    lambda t: fnmatch(t[0], atom),
+                    lambda t: fnmatch(t[0], token),
                     module.named_children(),
                 )
-                # apply the plan to all matched submodules
-                for _, submodule in matched_children:
-                    if path_splits:
-                        # we haven't reached the leaf, apply in dict style
-                        leaf_path = ".".join(
-                            path_splits
-                        )  # rest of the path after `atom`
-                        parallelize_module(
-                            submodule,
-                            device_mesh,
-                            {leaf_path: parallelize_style},
-                            src_data_rank=src_data_rank,
-                        )
-                    else:
-                        # otherwise, directly apply style to this submodule
-                        parallelize_module(
-                            submodule,
-                            device_mesh,
-                            parallelize_style,
-                            src_data_rank=src_data_rank,
-                        )
+            )
+            if not matched_children:
+                # No match at this level. Log a warning and process next plan entry.
+                warnings.warn(
+                    f"Parallelize plan key '{module_path}' could not be resolved: "
+                    f"no submodule matching token '{token}' in module {module}, "
+                    f"skipping this plan entry."
+                )
+                continue
+
+            # Now that we have a match, we can consume the token.
+            path_splits.pop(0)
+            # apply the plan to all matched submodules
+            for _, submodule in matched_children:
+                if path_splits:
+                    # we haven't reached the leaf, apply in dict style
+                    leaf_path = ".".join(path_splits)  # rest of the path after `token`
+                    parallelize_module(
+                        submodule,
+                        device_mesh,
+                        {leaf_path: parallelize_style},
+                        src_data_rank=src_data_rank,
+                    )
+                else:
+                    # otherwise, directly apply style to this submodule
+                    parallelize_module(
+                        submodule,
+                        device_mesh,
+                        parallelize_style,
+                        src_data_rank=src_data_rank,
+                    )
         return module
     else:
         raise TypeError(  # pyre-ignore[7]
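
For readers skimming the hunk above, this is a simplified standalone rendering of the new traversal logic. It is plain Python, not the actual torch implementation: apply_style is a hypothetical stand-in for applying a ParallelStyle, and the device-mesh plumbing is omitted.

import warnings
from fnmatch import fnmatch

import torch.nn as nn


def apply_plan(module: nn.Module, plan: dict, apply_style) -> nn.Module:
    for module_path, style in plan.items():
        if module_path == "":
            # "" applies the style to the current module itself.
            apply_style(module, style)
            continue

        path_splits = module_path.split(".")
        token = path_splits[0]
        matched = [
            (name, child)
            for name, child in module.named_children()
            if fnmatch(name, token)
        ]
        if not matched:
            # New behavior: warn and skip instead of consuming tokens blindly.
            warnings.warn(f"plan key '{module_path}' matched no submodule; skipping")
            continue

        rest = ".".join(path_splits[1:])
        for _, child in matched:
            if rest:
                # Not at the leaf yet: recurse with the remaining path.
                apply_plan(child, {rest: style}, apply_style)
            else:
                # Reached the leaf: apply the style directly.
                apply_style(child, style)
    return module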
