from graph_net.sample_pass.sample_pass import SamplePass
from graph_net.sample_pass.resumable_sample_pass_mixin import ResumableSamplePassMixin
from graph_net.torch.fx_graph_cache_util import (
    parse_immutable_model_path_into_sole_graph_module,
)
from graph_net.torch.decompose_util import convert_to_submodules_graph
from graph_net.torch.count_kernels_util import count_kernels
from graph_net.torch.fx_graph_module_util import (
    get_fx_graph_num_ops,
    get_torch_module_and_inputs,
)
from pathlib import Path
from typing import Optional
import json
import torch


class CumSumNumKernelsGenerator(SamplePass, ResumableSamplePassMixin):
    def __init__(self, config):
        super().__init__(config)

    def declare_config(
        self,
        model_path_prefix: str,
        output_dir: str,
        resume: bool = False,
        limits_handled_models: Optional[int] = None,
        output_json_file_name: str = "cumsum_num_kernels.json",
    ):
        # Declares the expected config keys via the signature; intentionally empty.
        pass

    def __call__(self, rel_model_path: str):
        self.resumable_handle_sample(rel_model_path)

    def sample_handled(self, rel_model_path: str) -> bool:
        file_name = self.config["output_json_file_name"]
        return self.naive_sample_handled(rel_model_path, search_file_name=file_name)

    def resume(self, rel_model_path: str):
        model_path = Path(self.config["model_path_prefix"]) / rel_model_path
        analyzer = CumsumNumKernelsAnalyzer(model_path)
        cumsum_num_kernels = analyzer.analyze()
        cumsum_num_kernels_json = json.dumps(cumsum_num_kernels, indent=4)
        output_dir_path = Path(self.config["output_dir"]) / rel_model_path
        # Ensure the per-model output directory exists before writing the report.
        output_dir_path.mkdir(parents=True, exist_ok=True)
        (output_dir_path / self.config["output_json_file_name"]).write_text(
            cumsum_num_kernels_json
        )


class CumsumNumKernelsAnalyzer:
    def __init__(self, model_path: Path):
        self.model_path = model_path

    def analyze(self):
        triples = list(self._get_cumsum_num_kernels())
        data = {
            "range_and_num_kernels": [
                ((start, end), num_kernels) for start, end, num_kernels in triples
            ],
        }
        return data

    def _get_cumsum_num_kernels(self):
        model_path = str(self.model_path)
        module, inputs = get_torch_module_and_inputs(model_path, use_dummy_inputs=False)
        gm = parse_immutable_model_path_into_sole_graph_module(model_path)
        # For each cumulative prefix range [0, end), compile that sub-range and
        # count the kernels it launches.
        for start, end in self._get_ranges(gm):
            assert start == 0
            num_kernels = self._get_num_kernels_if_submodule_compiled(
                graph_module=gm,
                nn_module=module,
                inputs=inputs,
                submodule_start=start,
                submodule_end=end,
            )
            print(f"subgraph_range=[{start}, {end})\t{num_kernels=}")
            yield start, end, num_kernels

    def _get_num_kernels_if_submodule_compiled(
        self, graph_module, nn_module, inputs, submodule_start, submodule_end
    ):
        torch.cuda.empty_cache()
        rewritten_gm: torch.fx.GraphModule = convert_to_submodules_graph(
            graph_module,
            submodule_hook=lambda m, seq_no: torch.compile(m),
            split_positions=[submodule_start, submodule_end],
            subgraph_ranges=[(submodule_start, submodule_end)],
            group_head_and_tail=False,
            chain_style=False,
        )
        _, num_kernels = count_kernels(rewritten_gm, inputs)
        return num_kernels

    def _get_ranges(self, gm):
        # Yield cumulative prefix ranges: [0, 1), [0, 2), ..., [0, num_ops).
        num_ops = get_fx_graph_num_ops(gm)
        for i in range(num_ops):
            cum_num_ops = i + 1
            yield 0, cum_num_ops
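

# A minimal usage sketch (an assumption, not part of the framework): it treats the
# config as a plain dict whose keys mirror declare_config above, and the paths are
# placeholders.
#
#     config = {
#         "model_path_prefix": "/path/to/models",
#         "output_dir": "/path/to/output",
#         "resume": False,
#         "limits_handled_models": None,
#         "output_json_file_name": "cumsum_num_kernels.json",
#     }
#     generator = CumSumNumKernelsGenerator(config)
#     generator("some_model")  # rel_model_path under model_path_prefix
#
# The analyzer can also be run directly on a single model directory:
#
#     analyzer = CumsumNumKernelsAnalyzer(Path("/path/to/models/some_model"))
#     print(json.dumps(analyzer.analyze(), indent=4))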