Commit 6df0cd0: "backup code"
Parent: 93fabbf

6 files changed: +136 additions, -30 deletions

graph_net/test/torch_extractor_test.py
Lines changed: 1 addition & 1 deletion

@@ -76,7 +76,7 @@ def forward(self, x):
         start_node_idx=0,
         end_node_idx=2,
         submodule_hook=submodule_hook,
-        # group_head_and_tail=False,
+        group_head_and_tail=True,
     )
     folded_output = folded(inp)

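Note: this test edit pairs with the decompose_util.py change below. The test previously relied on the library default (with group_head_and_tail=False left commented out); now that the default flips to False, the test opts in to head/tail grouping explicitly.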
graph_net/torch/decompose_util.py
Lines changed: 1 addition & 1 deletion

@@ -181,7 +181,7 @@ def fold_range_to_submodule(
     end_node_idx: int,
     submodule_hook=None,
     submodule_name="extracted_submodule",
-    group_head_and_tail=True,
+    group_head_and_tail=False,
 ):
     return convert_to_submodules_graph(
         gm,

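Since the default is now False, existing callers that relied on head/tail grouping must pass the flag explicitly. A minimal call-site sketch, assuming a toy traced module (MyModule and the literal node indices are placeholders for illustration, not part of this commit):

    import torch
    from graph_net.torch.decompose_util import fold_range_to_submodule

    class MyModule(torch.nn.Module):  # placeholder module for illustration
        def forward(self, x):
            return torch.relu(x + 1) * 2

    gm = torch.fx.symbolic_trace(MyModule())

    folded = fold_range_to_submodule(
        gm,
        start_node_idx=0,
        end_node_idx=2,
        submodule_hook=None,
        group_head_and_tail=True,  # no longer the default after this commit
    )
    folded_output = folded(torch.randn(4))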
graph_net/torch/fully_fusible_graph_predicator.py
Lines changed: 68 additions & 0 deletions

@@ -1,10 +1,17 @@
+import torch
 import traceback
 import logging
+from graph_net.imp_util import load_module
+from graph_net.torch.decompose_util import fold_range_to_submodule
 from graph_net.torch.graph_decomposer import NaiveDecomposerExtractor
 from graph_net.torch.graph_fusibility_status import (
     GraphFusibilityStatus,
     GraphFusibility,
 )
+from graph_net.torch.fx_graph_module_util import get_torch_module_and_inputs
+from graph_net.torch.fx_graph_cache_util import (
+    parse_immutable_model_path_into_sole_graph_module,
+)

 logger = logging.getLogger(__name__)

@@ -32,3 +39,64 @@ def __call__(self, model_path):
         traceback.print_exc()
         print("--------------------------\n")
         return False
+
+
+class FullyFusibleSubGraphPredicator:
+    def __init__(self, config):
+        if config is None:
+            config = {}
+        self.config = self._make_config(config)
+        self.nn_module_fully_fusible_decorator = (
+            self._make_nn_module_fully_fusible_decorator(config)
+        )
+        model_path = self.config["model_path"]
+        module, inputs = get_torch_module_and_inputs(model_path)
+        self.traced_module = parse_immutable_model_path_into_sole_graph_module(
+            model_path
+        )
+        self.inputs = inputs
+
+    def _make_nn_module_fully_fusible_decorator(self, config):
+        py_module = load_module(self.config["nn_module_fully_fusible_decorator_path"])
+        decorator_cls = getattr(
+            py_module, self.config["nn_module_fully_fusible_decorator_class_name"]
+        )
+        return decorator_cls(self.config["nn_module_fully_fusible_decorator_config"])
+
+    def _make_config(
+        self,
+        model_path,
+        nn_module_fully_fusible_decorator_path,
+        nn_module_fully_fusible_decorator_class_name,
+        nn_module_fully_fusible_decorator_config=None,
+    ):
+        if nn_module_fully_fusible_decorator_config is None:
+            nn_module_fully_fusible_decorator_config = {}
+        return {
+            "model_path": model_path,
+            "nn_module_fully_fusible_decorator_path": nn_module_fully_fusible_decorator_path,
+            "nn_module_fully_fusible_decorator_class_name": nn_module_fully_fusible_decorator_class_name,
+            "nn_module_fully_fusible_decorator_config": nn_module_fully_fusible_decorator_config,
+        }
+
+    def __call__(self, gm: torch.fx.GraphModule, start_node_idx, end_node_idx):
+        try:
+            rewrited_gm: torch.fx.GraphModule = fold_range_to_submodule(
+                gm,
+                start_node_idx=start_node_idx,
+                end_node_idx=end_node_idx,
+                submodule_hook=self.nn_module_fully_fusible_decorator,
+            )
+            rewrited_gm(*self.inputs)
+        except GraphFusibilityStatus as status:
+            if status.graph_fusibility == GraphFusibility.kFullyFusible:
+                return True
+            elif status.graph_fusibility == GraphFusibility.kNotFullyFusible:
+                return False
+            else:
+                raise NotImplementedError(f"{status.graph_fusibility=}")
+        except Exception:
+            print("\n--- Custom Error Handler ---")
+            traceback.print_exc()
+            print("--------------------------\n")
+            return False

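Review note: as committed, __init__ passes a single dict to self._make_config(config), while _make_config declares individual keyword parameters, so construction would raise a TypeError; unpacking the dict (self._make_config(**config)) looks like the intent. Also, module from get_torch_module_and_inputs is bound but unused; only inputs is kept. The class itself reports fusibility through the GraphFusibilityStatus exception raised inside the hooked submodule's forward. A minimal self-contained sketch of that exception-as-verdict control flow (the enum and exception below are stand-ins mirroring graph_fusibility_status, not the library's definitions):

    import enum


    class GraphFusibility(enum.Enum):  # stand-in for graph_net.torch.graph_fusibility_status
        kFullyFusible = enum.auto()
        kNotFullyFusible = enum.auto()


    class GraphFusibilityStatus(Exception):  # carries the verdict out of forward()
        def __init__(self, graph_fusibility):
            super().__init__(graph_fusibility)
            self.graph_fusibility = graph_fusibility


    def probe(run_folded_graph):
        # Mirrors FullyFusibleSubGraphPredicator.__call__: run the rewritten
        # graph, catch the status raised by the hooked submodule, map it to a bool.
        try:
            run_folded_graph()
        except GraphFusibilityStatus as status:
            return status.graph_fusibility == GraphFusibility.kFullyFusible
        return False


    def fully_fusible_run():
        raise GraphFusibilityStatus(GraphFusibility.kFullyFusible)


    print(probe(fully_fusible_run))  # True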
graph_net/torch/fully_fusible_subgraph_extractor.py
Lines changed: 18 additions & 8 deletions

@@ -103,18 +103,28 @@ def __call__(self, rel_model_path):
             check_fusible_config = self._build_decompose_config(
                 temp_dir, start_pos, end_pos, self.config["model_path_prefix"]
             )
-            predicator = fully_fusible_graph_predicator.FullyFusibleGraphPredicator(
-                check_fusible_config
+            predicator_cls = (
+                fully_fusible_graph_predicator.FullyFusibleGraphPredicator
             )
+            predicator = predicator_cls(check_fusible_config)
             logger.warning("fully_fusible_graph_predicator-begin")
             success = predicator(model_path)
             logger.warning("fully_fusible_graph_predicator-end")
-            if success:
-                target_path = self._handle_success(temp_dir, rel_model_path)
-                print(
-                    f"SUCCESS in finding the biggest fully fusible subgraph. Result saved to: {target_path}"
-                )
-                break
+            if not success:
+                continue
+            decomposer_config = self._build_decompose_config(
+                temp_dir, start_pos, end_pos, self.config["model_path_prefix"]
+            )
+            predicator_cls = (
+                fully_fusible_graph_predicator.FullyFusibleGraphPredicator
+            )
+            predicator = predicator_cls(decomposer_config)
+            predicator(model_path)
+            target_path = self._handle_success(temp_dir, rel_model_path)
+            print(
+                f"SUCCESS in finding the biggest fully fusible subgraph. Result saved to: {target_path}"
+            )
+            break
         else:
             logger.warning("fail to find fully fusible subgraph")
         return gm.forward

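Review note: the rewrite converts the success branch into an early continue, then builds decomposer_config and runs the predicator a second time before saving. As committed, decomposer_config comes from the same _build_decompose_config call with identical arguments as check_fusible_config, so the second run repeats the same check; a distinct decomposer configuration may have been intended. Note also that the trailing else: belongs to the enclosing for loop (for/else), so the "fail to find" warning fires only when no window reached break.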
graph_net/torch/graph_decomposer.py
Lines changed: 14 additions & 17 deletions

@@ -5,6 +5,9 @@
 from graph_net.torch.extractor import GraphExtractor as BuiltinGraphExtractor
 import graph_net.imp_util as imp_util
 from graph_net.torch.fx_graph_module_util import get_torch_module_and_inputs
+from graph_net.torch.fx_graph_cache_util import (
+    parse_immutable_model_path_into_sole_graph_module,
+)
 from graph_net.torch.fx_graph_parse_util import parse_sole_graph_module
 import logging

@@ -134,30 +137,24 @@ def _make_config(
         }

     def __call__(self, rel_model_path):
-        # callback = lambda: logger.warning("NaiveDecomposerExtractor-call-end")
-        # logger.warning("NaiveDecomposerExtractor-call-begin")
-        # atexit.register(callback)
         model_path = os.path.join(self.config["model_path_prefix"], rel_model_path)
         config = {
             k: v
             for k, v in self.config.items()
             if k in {"split_positions", "group_head_and_tail", "chain_style"}
         }
-        # logger.warning("get_torch_module_and_inputs_begin")
         module, inputs = get_torch_module_and_inputs(model_path)
-        # logger.warning("get_torch_module_and_inputs_end")
-        # logger.warning("parse_sole_graph_module_begin")
-        gm = parse_sole_graph_module(module, inputs)
-        # logger.warning("parse_sole_graph_module_end")
-        # callback = lambda: logger.warning("convert_to_submodules_graph-call-end")
-        # logger.warning("convert_to_submodules_graph-call-begin")
-        # atexit.register(callback)
-        rewrited_gm: torch.fx.GraphModule = convert_to_submodules_graph(
-            gm,
-            submodule_hook=self.get_naive_decomposer_extractor(model_path),
-            **config,
-        )
-        rewrited_gm(*inputs)
+        gm = parse_immutable_model_path_into_sole_graph_module(model_path)
+        try:
+            logger.warning("convert_to_submodules_graph-call-begin")
+            rewrited_gm: torch.fx.GraphModule = convert_to_submodules_graph(
+                gm,
+                submodule_hook=self.get_naive_decomposer_extractor(model_path),
+                **config,
+            )
+            rewrited_gm(*inputs)
+        finally:
+            logger.warning("convert_to_submodules_graph-call-end")

     def get_naive_decomposer_extractor(self, model_path):
         def fn(submodule, seq_no):

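Review note: the block of commented-out logger/atexit calls is dropped, and the begin/end markers move into a try/finally so the end marker is emitted even when convert_to_submodules_graph or the replay raises. parse_sole_graph_module(module, inputs) is replaced by the cached parse_immutable_model_path_into_sole_graph_module(model_path); after this change, module from get_torch_module_and_inputs is used only for its inputs, so the unpack could be tightened to _, inputs = get_torch_module_and_inputs(model_path).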
graph_net/torch/post_extract_process_count_kernels.py
Lines changed: 34 additions & 3 deletions

@@ -12,6 +12,31 @@
 )


+class TorchNNModuleFullyFusibleDecorator:
+    def __init__(self, config):
+        self.config = config
+
+    def __call__(self, module):
+        return TorchNNModuleFullyFusiblePredicator(module)
+
+
+class TorchNNModuleFullyFusiblePredicator(torch.nn.Module):
+    def __init__(self, module):
+        self.module = module
+
+    def forward(self, *inputs):
+        try:
+            compiled_model = torch.compile(self.module)
+        except Exception:
+            raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
+        ret_tensors, compiled_num_of_kernels = count_kernels(compiled_model, inputs)
+        if compiled_num_of_kernels == 1:
+            raise GraphFusibilityStatus(GraphFusibility.kFullyFusible)
+        else:
+            raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
+        return ret_tensors
+
+
 class ThrowExitStatusIfGraphFullyFusible:
     def __init__(self, config):
         self.config = config

@@ -45,7 +70,7 @@ def __call__(self, model_path=None):
             compiled_model = torch.compile(model)
         except Exception:
             raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
-        compiled_num_of_kernels = count_kernels(compiled_model, state_dict)
+        _, compiled_num_of_kernels = count_kernels(compiled_model, state_dict)
         if compiled_num_of_kernels == 1:
             raise GraphFusibilityStatus(GraphFusibility.kFullyFusible)
         else:

@@ -103,11 +128,17 @@ def count_kernels(model, sample_inputs) -> int:
         record_shapes=True,
     ) as prof:
         with record_function("model_inference"):
-            _ = model(**sample_inputs)
+            if isinstance(sample_inputs, dict):
+                ret_tensors = model(**sample_inputs)
+            elif isinstance(sample_inputs, (list, tuple)):
+                ret_tensors = model(*sample_inputs)
+            else:
+                raise NotImplementedError(f"{type(sample_inputs)=}")
+
     events = prof.key_averages()

     total_count = 0
     for e in events:
         if e.key == "cuLaunchKernel" or e.key == "cudaLaunchKernel":
             total_count += e.count
-    return total_count
+    return ret_tensors, total_count

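Review notes on the new classes: TorchNNModuleFullyFusiblePredicator subclasses torch.nn.Module but never calls super().__init__(), so the self.module assignment will fail (nn.Module.__setattr__ requires initialization first); the return ret_tensors at the end of forward is unreachable because both branches raise; and count_kernels still carries a stale -> int annotation although it now returns a (ret_tensors, total_count) pair. A hedged sketch of the predicator with those issues addressed (count_kernels, GraphFusibilityStatus, and GraphFusibility are the names from this commit, assumed importable here):

    import torch


    class TorchNNModuleFullyFusiblePredicator(torch.nn.Module):
        def __init__(self, module):
            super().__init__()  # required before registering self.module
            self.module = module

        def forward(self, *inputs):
            try:
                compiled_model = torch.compile(self.module)
            except Exception:
                raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
            # count_kernels now returns (ret_tensors, total_count)
            _, num_kernels = count_kernels(compiled_model, inputs)
            status = (
                GraphFusibility.kFullyFusible
                if num_kernels == 1
                else GraphFusibility.kNotFullyFusible
            )
            raise GraphFusibilityStatus(status)  # the verdict always leaves via the exception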