
Commit 0eb02b6

Look for the biggest fully fusible subgraph (#406)
* 1119
* 1120
* 1120.2
* model_path
* remove unnecessary files and pre-committed
* remove unnecessary files and pre-committed
* 1121 remove unnecessary files
* modify rev version
* modify rev version
* modify rev version
* accuracy issues targeted
* test script and modify feature
* return set[str]
* add logfile for test
* filter can get the number of kernels in naive_graph_decomposer
* post extract process feature
* remove unnecessary code blocks and variables
* modify the way of counting kernels used
* modify the way of counting kernels used
* modify script, rename files and variables
* add failure protection and log output when removing directories
* add a script to check fusability of a given model
* add a script to check if a given model is fully fusable
* add a script to check if a given model is fully fusable
* a script to check if a given model is fully fusable
* add a script to check if a given model is fully fusionable
* add a script to find fully fusionable subgraph
* find the biggest fully fusionable subgraph
* find the biggest fusionable subgraph
* add a script to get the biggest fully fusable subgraph
* use tempfile, fix sys problem, remove useless configs
1 parent 0aa1827 commit 0eb02b6

9 files changed: +269 −6 lines

graph_net/test/dimension_generalization_test.sh — file mode changed 100644 → 100755.
New test script (filename not preserved in this capture; it drives fully_fusable_subgraph_extractor.py) — Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/bin/bash

GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
os.path.dirname(graph_net.__file__))")

# input model path
MODEL_NAME=resnet18
MODEL_PATH_IN_SAMPLES=/timm/$MODEL_NAME
decorator_config_json_str=$(cat <<EOF
{
  "decorator_path": "$GRAPH_NET_ROOT/torch/extractor.py",
  "decorator_config": {
    "name": "$MODEL_NAME",
    "custom_extractor_path": "$GRAPH_NET_ROOT/torch/fully_fusable_subgraph_extractor.py",
    "custom_extractor_config": {
      "split_positions": [],
      "group_head_and_tail": true,
      "max_step": 5,
      "min_step": 2,
      "max_nodes": 6
    }
  }
}
EOF
)
DECORATOR_CONFIG=$(echo $decorator_config_json_str | base64 -w 0)

python3 -m graph_net.torch.run_model --model-path $GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES --decorator-config=$DECORATOR_CONFIG
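
For reference, the same payload can be assembled in Python. The sketch below is an illustration, not a file from this commit; it assumes an installed graph_net and mirrors the JSON keys of the heredoc above, using only the standard library for encoding.

import base64
import json
import os

import graph_net

# Mirror of the JSON document the heredoc above assembles.
graph_net_root = os.path.dirname(graph_net.__file__)
config = {
    "decorator_path": f"{graph_net_root}/torch/extractor.py",
    "decorator_config": {
        "name": "resnet18",
        "custom_extractor_path": f"{graph_net_root}/torch/fully_fusable_subgraph_extractor.py",
        "custom_extractor_config": {
            "split_positions": [],
            "group_head_and_tail": True,
            "max_step": 5,
            "min_step": 2,
            "max_nodes": 6,
        },
    },
}
# Equivalent of `echo "$json" | base64 -w 0`: a single unwrapped base64 line.
encoded = base64.b64encode(json.dumps(config).encode("utf-8")).decode("ascii")
print(encoded)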
graph_net/test/naive_decomposer_and_post_extract_process_test.sh (named in the script's own usage comment) — Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
#!/bin/bash
# bash graph_net/test/naive_decomposer_and_post_extract_process_test.sh

GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
os.path.dirname(graph_net.__file__))")

# input model path
MODEL_NAME=resnet18
MODEL_PATH_IN_SAMPLES=/timm/$MODEL_NAME
decorator_config_json_str=$(cat <<EOF
{
  "decorator_path": "$GRAPH_NET_ROOT/torch/extractor.py",
  "decorator_config": {
    "name": "$MODEL_NAME",
    "custom_extractor_path": "$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py",
    "custom_extractor_config": {
      "output_dir": "/tmp/naive_decompose_workspace",
      "split_positions": [8, 16, 32],
      "group_head_and_tail": true,
      "filter_path": "$GRAPH_NET_ROOT/torch/naive_subgraph_filter.py",
      "filter_config": {},
      "post_extract_process_path": "$GRAPH_NET_ROOT/torch/post_extract_process_count_kernels.py",
      "post_extract_process_class_name": "GraphFullyFusable"
    }
  }
}
EOF
)
DECORATOR_CONFIG=$(echo $decorator_config_json_str | base64 -w 0)

python3 -m graph_net.torch.run_model --model-path $GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES --decorator-config=$DECORATOR_CONFIG
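
The split_positions [8, 16, 32] above appear to mark node indices at which the traced graph is cut into consecutive submodules (with group_head_and_tail controlling how the ends are grouped). A toy sketch of that presumed partitioning, not graph_net code:

# Toy illustration (not from this commit): how split positions such as
# [8, 16, 32] would partition a sequence of N graph nodes into
# consecutive subgraphs.
def partition(num_nodes, split_positions):
    bounds = [0, *split_positions, num_nodes]
    return [(lo, hi) for lo, hi in zip(bounds, bounds[1:]) if lo < hi]

print(partition(40, [8, 16, 32]))  # [(0, 8), (8, 16), (16, 32), (32, 40)]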

graph_net/test/naive_graph_decomposer_test.sh — file mode changed 100644 → 100755. Lines changed: 0 additions & 1 deletion
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-
 GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
 os.path.dirname(graph_net.__file__))")
 
New extractor module (filename not preserved in this capture; the first test script above references graph_net/torch/fully_fusable_subgraph_extractor.py) — Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
import os
import torch
import graph_net
import tempfile
from graph_net.torch import constraint_util


class GraphExtractor:
    def __init__(
        self,
        config: dict,
        name,
        dynamic,
        mut_graph_codes=None,
        placeholder_auto_rename=False,
    ):
        self.subgraph_counter = 0
        self.name = name
        self.dynamic = dynamic
        self.mut_graph_codes = mut_graph_codes
        self.placeholder_auto_rename = placeholder_auto_rename
        self.config = self.make_config(**config)

    def make_config(
        self,
        split_positions=(),
        group_head_and_tail=False,
        chain_style=False,
        max_step=8,
        min_step=2,
        max_nodes=32,
    ):
        for pos in split_positions:
            assert isinstance(
                pos, int
            ), f"split_positions should be list of int, {split_positions=}"
        return {
            "split_positions": split_positions,
            "group_head_and_tail": group_head_and_tail,
            "chain_style": chain_style,
            "max_step": max_step,
            "min_step": min_step,
            "max_nodes": max_nodes,
        }

    def _get_sub_ranges(self):
        # Yield candidate [start_pos, end_pos) windows, widest first, so the
        # first window that passes the check is the biggest fusable subgraph.
        for step in reversed(
            range(self.config["min_step"], self.config["max_step"] + 1)
        ):
            for start_pos in range(self.config["max_nodes"] - step):
                end_pos = start_pos + step
                yield start_pos, end_pos

    def __call__(self, gm: torch.fx.GraphModule, sample_inputs):
        temp_dir_obj = tempfile.TemporaryDirectory(prefix="_check_fusable_")
        temp_output_dir = temp_dir_obj.name
        found_fusable_subgraph = False
        print(f"Using temp output dir: {temp_output_dir}")
        for start_pos, end_pos in self._get_sub_ranges():
            self.config["split_positions"] = [start_pos, end_pos]
            print("current split_positions:", self.config["split_positions"])
            graph_net_root = os.path.dirname(graph_net.__file__)
            model_path = os.path.join(
                graph_net_root, "..", "samples", "timm", self.name
            )
            check_fusable_config = {
                "decorator_path": f"{graph_net_root}/torch/extractor.py",
                "decorator_config": {
                    "name": f"{self.name}",
                    "custom_extractor_path": f"{graph_net_root}/torch/naive_graph_decomposer.py",
                    "custom_extractor_config": {
                        "output_dir": temp_output_dir,
                        "split_positions": self.config["split_positions"],
                        "group_head_and_tail": False,
                        "filter_path": f"{graph_net_root}/torch/naive_subgraph_filter.py",
                        "filter_config": {},
                        "post_extract_process_path": f"{graph_net_root}/torch/post_extract_process_count_kernels.py",
                        "post_extract_process_class_name": "GraphFullyFusable",
                    },
                },
            }
            success = constraint_util.RunModelPredicator(check_fusable_config)(
                model_path
            )
            if success:
                found_fusable_subgraph = True
                # Disable cleanup so the winning extraction survives this run.
                temp_dir_obj.cleanup = lambda: None
                print(
                    f"SUCCESS in finding the biggest fully fusable subgraph saved in: {temp_output_dir}."
                )
                break
            else:
                print("Failed attempt; cleaning up the workspace and continuing the search.")
                temp_dir_obj.cleanup()
                continue
        if not found_fusable_subgraph:
            print("No fusable subgraph found")
        return gm
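
Note the search order: _get_sub_ranges yields the widest windows first, so the first range that passes the fully-fusable check is the largest one. A standalone replay of the generator with the test script's settings (min_step=2, max_step=5, max_nodes=6):

# Standalone replay of _get_sub_ranges, outside the class, with the
# settings from the test script above.
def get_sub_ranges(min_step, max_step, max_nodes):
    for step in reversed(range(min_step, max_step + 1)):
        for start_pos in range(max_nodes - step):
            yield start_pos, start_pos + step

print(list(get_sub_ranges(min_step=2, max_step=5, max_nodes=6)))
# [(0, 5), (0, 4), (1, 5), (0, 3), (1, 4), (2, 5), (0, 2), (1, 3), (2, 4), (3, 5)]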

graph_net/torch/naive_graph_decomposer.py — Lines changed: 22 additions & 3 deletions
@@ -1,8 +1,5 @@
 import os
 import torch
-import shutil
-from typing import Union, Callable
-from graph_net.torch import utils
 from graph_net.torch.decompose_util import convert_to_submodules_graph
 from graph_net.torch.extractor import GraphExtractor as BuiltinGraphExtractor
 import graph_net.imp_util as imp_util
@@ -32,6 +29,8 @@ def make_config(
         output_dir="./tmp/naive_decomposer_dir",
         filter_path=None,
         filter_config=None,
+        post_extract_process_path=None,
+        post_extract_process_class_name=None,
     ):
         for pos in split_positions:
             assert isinstance(
@@ -44,6 +43,8 @@ def make_config(
             "output_dir": output_dir,
             "filter_path": filter_path,
             "filter_config": filter_config if filter_config is not None else {},
+            "post_extract_process_path": post_extract_process_path,
+            "post_extract_process_class_name": post_extract_process_class_name,
         }
 
     def __call__(self, gm: torch.fx.GraphModule, sample_inputs):
@@ -71,6 +72,7 @@ def __init__(self, parent_graph_extractor, submodule, seq_no):
         self.seq_no = seq_no
         self.extracted = False
         name = f"{parent_graph_extractor.name}_{self.seq_no}"
+        self.model_name = name
         self.builtin_extractor = BuiltinGraphExtractor(
             name=name,
             dynamic=False,
@@ -79,21 +81,38 @@ def __init__(self, parent_graph_extractor, submodule, seq_no):
             workspace_path=self.parent_graph_extractor.config["output_dir"],
         )
         self.filter = self.make_filter(self.parent_graph_extractor.config)
+        self.post_extract_process = self.make_post_extract_process(
+            self.parent_graph_extractor.config
+        )
 
     def forward(self, *args):
         if not self.extracted:
             if self.need_extract(self.submodule, args):
                 self.builtin_extractor(self.submodule, args)
                 self.extracted = True
+                self._post_extract_process()
         return self.submodule(*args)
 
     def need_extract(self, gm, sample_inputs):
         if self.filter is None:
             return True
         return self.filter(gm, sample_inputs)
 
+    def _post_extract_process(self):
+        model_path = os.path.join(
+            self.parent_graph_extractor.config["output_dir"], self.model_name
+        )
+        return self.post_extract_process(model_path)
+
     def make_filter(self, config):
         if config["filter_path"] is None:
             return None
         module = imp_util.load_module(config["filter_path"])
         return module.GraphFilter(config["filter_config"])
+
+    def make_post_extract_process(self, config):
+        if config["post_extract_process_path"] is None:
+            return None
+        module = imp_util.load_module(config["post_extract_process_path"])
+        cls = getattr(module, config["post_extract_process_class_name"])
+        return cls(config["post_extract_process_path"])
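
Taken together, these hooks define an informal plug-in interface: the class named by post_extract_process_class_name is loaded from the module at post_extract_process_path, constructed with a single config argument (currently the path itself, as the last line above shows), and later called with the extracted submodel's directory. A minimal hypothetical plug-in written against that interface:

import os


# Hypothetical plug-in (not part of this commit), matching the interface
# above: constructed with one argument, called with the directory of a
# freshly extracted submodel.
class ListExtractedFiles:
    def __init__(self, config):
        self.config = config

    def __call__(self, model_path):
        # Any per-submodel check fits here; GraphFullyFusable (below)
        # compiles the submodel and counts its CUDA kernel launches.
        print(f"extracted {model_path}: {sorted(os.listdir(model_path))}")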

graph_net/torch/naive_subgraph_filter.py — Lines changed: 0 additions & 1 deletion
@@ -3,5 +3,4 @@ def __init__(self, config):
         self.config = config
 
     def __call__(self, gm, sample_inputs):
-        print(f"GraphFilter\n{gm.code}")
         return True
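
The filter follows the same convention: module.GraphFilter is constructed with filter_config and called with (gm, sample_inputs), and returning False skips extraction of that subgraph. A hypothetical variant that enforces a node budget:

# Hypothetical filter variant (not part of this commit): accept a
# subgraph only while it stays under a configurable node budget.
class GraphFilter:
    def __init__(self, config):
        self.max_nodes = config.get("max_nodes", 32)

    def __call__(self, gm, sample_inputs):
        return len(list(gm.graph.nodes)) <= self.max_nodes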
New post-extract check (filename not preserved in this capture; the configs above reference graph_net/torch/post_extract_process_count_kernels.py) — Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
from graph_net.torch import utils
import importlib.util
import torch
import sys
from typing import Type
from torch.profiler import profile, record_function, ProfilerActivity


class GraphFullyFusable:
    def __init__(self, config):
        self.config = config

    def __call__(self, model_path=None):
        torch._dynamo.reset()
        if model_path is None:
            sys.exit(1)
        # model
        model_class = load_class_from_file(
            f"{model_path}/model.py", class_name="GraphModule"
        )
        assert model_class is not None
        model = model_class()

        inputs_params = utils.load_converted_from_text(f"{model_path}")
        params = inputs_params["weight_info"]
        state_dict = {k: utils.replay_tensor(v) for k, v in params.items()}

        # try to run the model
        try:
            model(**state_dict)
        except Exception as e:
            print(f"failed in running model: {e}")
            sys.exit(1)
        # try to compile the model
        try:
            compiled_model = torch.compile(model)
        except Exception as e:
            print(f"failed in compiling model: {e}")
            sys.exit(1)
        # The exit code reports the result to the calling process.
        compiled_num_of_kernels = count_kernels(compiled_model, state_dict)
        if compiled_num_of_kernels == 1:
            print(model_path, "can be fully fused!")
            sys.exit(0)
        else:
            print(f"{model_path} can not be fully fused, to be removed...")
            sys.exit(1)


def load_class_from_file(file_path: str, class_name: str) -> Type[torch.nn.Module]:
    spec = importlib.util.spec_from_file_location("unnamed", file_path)
    unnamed = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(unnamed)
    model_class = getattr(unnamed, class_name, None)
    return model_class


def count_kernels(model, sample_inputs) -> int:
    """
    Count the number of CUDA kernel launches performed during a model's forward pass.

    Args:
        model: the (possibly compiled) graph model to profile.
        sample_inputs: keyword tensor inputs for the forward pass.

    Returns:
        int: the number of kernel launches.

    Behavior:
        - Runs the model once inside a PyTorch profiler context.
        - Sums the counts of the 'cuLaunchKernel' and 'cudaLaunchKernel'
          events, which correspond to CUDA kernel launches.
    """
    model.eval()
    # Use PyTorch Profiler
    with profile(
        activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
        record_shapes=True,
    ) as prof:
        with record_function("model_inference"):
            _ = model(**sample_inputs)
    events = prof.key_averages()

    total_count = 0
    for e in events:
        if e.key == "cuLaunchKernel" or e.key == "cudaLaunchKernel":
            total_count += e.count
    return total_count
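
A usage sketch for count_kernels on a toy model. The import path below is an assumption, since the new file's name is not preserved in this view, and a CUDA device is needed for nonzero launch counts:

import torch

# Hypothetical module path; the new file's real name is not shown above.
from graph_net.torch.post_extract_process_count_kernels import count_kernels


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return torch.relu(self.linear(x))


model = TinyModel().cuda()
compiled = torch.compile(model)
# A fully fused forward pass would report a single kernel launch.
print(count_kernels(compiled, {"x": torch.randn(4, 8, device="cuda")}))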

samples/timm/resnet18/input_tensor_constraints.py — Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from sympy import Symbol, Expr, Rel, Eq
+from sympy import Symbol
 
 S0 = Symbol("S0")
 S1 = Symbol("S1")
