Commit ca860b3

modify the way of counting kernels used

1 parent fe89add

File tree

4 files changed: +17 -31 lines

graph_net/test/naive_decomposer_and_post_extract_process_test.sh

Lines changed: 1 addition & 5 deletions

@@ -24,11 +24,7 @@ decorator_config_json_str=$(cat <<EOF
     "group_head_and_tail": true,
     "filter_path":"$GRAPH_NET_ROOT/torch/naive_subgraph_filter.py",
     "filter_config": {},
-    "post_extract_process_path":"$GRAPH_NET_ROOT/torch/post_extract_process.py",
-    "post_extract_process_config": {
-      "decorator_path": "$GRAPH_NET_ROOT/torch/shape_prop.py",
-      "decorator_class_name": "ShapePropagate"
-    }
+    "post_extract_process_path":"$GRAPH_NET_ROOT/torch/post_extract_process.py"
   }
 }
}
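The test now configures the post-extract step by path alone and drops the decorator sub-config. A hedged Python illustration of what a consumer of this config sees after the change; the paths are placeholders and json.loads stands in for however the heredoc string is actually parsed.

import json

# Hypothetical illustration of consuming a decorator-config string like the
# one in this test; only the keys visible in the hunk are shown.
config_str = """
{
    "group_head_and_tail": true,
    "filter_path": "/path/to/naive_subgraph_filter.py",
    "filter_config": {},
    "post_extract_process_path": "/path/to/post_extract_process.py"
}
"""

config = json.loads(config_str)
assert config["post_extract_process_path"].endswith("post_extract_process.py")
print(config.get("post_extract_process_config"))  # None after this change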

graph_net/torch/naive_graph_decomposer.py

Lines changed: 3 additions & 3 deletions

@@ -92,7 +92,7 @@ def forward(self, *args):
         if not self.extracted:
             if self.need_extract(self.submodule, args):
                 self.builtin_extractor(self.submodule, args)
-                self.get_post_extract_process()
+                self._post_extract_process()
             self.extracted = True
         return self.submodule(*args)

@@ -101,7 +101,7 @@ def need_extract(self, gm, sample_inputs):
             return True
         return self.filter(gm, sample_inputs)

-    def get_post_extract_process(self):
+    def _post_extract_process(self):
         model_path = os.path.join(
             self.parent_graph_extractor.config["output_dir"], self.modelname
         )

@@ -114,7 +114,7 @@ def make_filter(self, config):
         return module.GraphFilter(config["filter_config"])

     def make_post_extract_process(self, config):
-        if config["filter_path"] is None:
+        if config["post_extract_process_path"] is None:
             return None
         module = imp_util.load_module(config["post_extract_process_path"])
         return module.PostExtractProcess(config["post_extract_process_config"])
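The corrected guard now keys on the same setting it loads. A minimal standalone sketch of this plug-in pattern, assuming imp_util.load_module behaves like the standard importlib file loader (its implementation is not part of this commit):

import importlib.util


def load_module_from_path(path):
    # Assumed stand-in for graph_net's imp_util.load_module.
    spec = importlib.util.spec_from_file_location("plugin_module", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def make_post_extract_process(config):
    # Guard on the key actually used below; the pre-fix code tested
    # "filter_path", so a filter-only config would wrongly reach the loader.
    if config["post_extract_process_path"] is None:
        return None
    module = load_module_from_path(config["post_extract_process_path"])
    return module.PostExtractProcess(config["post_extract_process_config"])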

graph_net/torch/naive_subgraph_filter.py

Lines changed: 0 additions & 1 deletion

@@ -3,5 +3,4 @@ def __init__(self, config):
         self.config = config

     def __call__(self, gm, sample_inputs):
-        # print(f"GraphFilter\n{gm.code}")
         return True
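With the debug print gone, the filter is a pure pass-through. The whole file presumably reduces to the sketch below; the class header sits outside the diff context and is inferred from make_filter in naive_graph_decomposer.py, which instantiates module.GraphFilter.

# Inferred full content of graph_net/torch/naive_subgraph_filter.py after
# this commit; the first two lines are assumptions, not shown in the diff.
class GraphFilter:
    def __init__(self, config):
        self.config = config

    def __call__(self, gm, sample_inputs):
        # Accept every subgraph; a non-trivial filter would inspect gm here.
        return True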

graph_net/torch/post_extract_process.py

Lines changed: 13 additions & 22 deletions

@@ -1,17 +1,8 @@
 from graph_net.torch import utils
-import argparse
 import importlib.util
-import inspect
 import shutil
 import torch
-import logging
-from pathlib import Path
-from typing import Type, Any
-import sys
-import json
-import base64
-from contextlib import contextmanager
-
+from typing import Type
 from torch.profiler import profile, record_function, ProfilerActivity

@@ -34,7 +25,9 @@ def __call__(self, model_path=None):
         params = inputs_params["weight_info"]
         state_dict = {k: utils.replay_tensor(v) for k, v in params.items()}

-        compiled_num_of_kernels = compile_and_count_kernels(model, state_dict)
+        model(**state_dict)
+        compiled_model = torch.compile(model)
+        compiled_num_of_kernels = count_kernels(model, state_dict)
         if compiled_num_of_kernels == 1:
             print(model_path, "can be fully integrated")
             return True

@@ -52,12 +45,12 @@ def load_class_from_file(file_path: str, class_name: str) -> Type[torch.nn.Modul
     return model_class


-def compile_and_count_kernels(gm, sample_inputs) -> int:
+def count_kernels(model, sample_inputs) -> int:
     """
     Count the number of CUDA kernel launches performed during a model's forward pass.

     Args:
-        gm(graph models)
+        model(graph models)
         sample_inputs(tensors)

     Returns:

@@ -68,21 +61,19 @@ def compile_and_count_kernels(gm, sample_inputs) -> int:
         - Identifies the event with key = 'cudaLaunchKernel', which corresponds
           to the number of CUDA kernel launches.
     """
-    gm.eval()
+    model.eval()
     # Use PyTorch Profiler
-    compiled_gm = torch.compile(gm)
-    _ = compiled_gm(**sample_inputs)

     with profile(
         activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
         record_shapes=True,
     ) as prof:
         with record_function("model_inference"):
-            output = compiled_gm(**sample_inputs)
+            output = model(**sample_inputs)
     events = prof.key_averages()
-    if_compile_work = any(e.key == "TorchDynamo Cache Lookup" for e in events)
-    if not if_compile_work:
-        return -1
+
+    total_count = 0
     for e in events:
-        if e.key == "cuLaunchKernel":
-            return e.count
+        if e.key == "cuLaunchKernel" or e.key == "cudalaunchKernel":
+            total_count += e.count
+    return total_count
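The commit replaces the early bail-out on a missing "TorchDynamo Cache Lookup" event with a warm-up call, a compile step, and a sum over kernel-launch events. A self-contained sketch of the counting idea, assuming a CUDA device; the toy model and input names are illustrative. Note the profiler reports runtime-API launches under the key "cudaLaunchKernel" (capital L) and driver-API launches, such as Triton kernels emitted by torch.compile, under "cuLaunchKernel", and event-key matching is case-sensitive.

import torch
from torch.profiler import ProfilerActivity, profile, record_function

# Event keys the profiler emits for kernel launches: CUDA runtime API
# ("cudaLaunchKernel") and CUDA driver API ("cuLaunchKernel").
LAUNCH_KEYS = ("cudaLaunchKernel", "cuLaunchKernel")


def count_kernels(model, sample_inputs) -> int:
    """Count CUDA kernel launches during one forward pass."""
    model.eval()
    with profile(
        activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
        record_shapes=True,
    ) as prof:
        with record_function("model_inference"):
            model(**sample_inputs)
    return sum(e.count for e in prof.key_averages() if e.key in LAUNCH_KEYS)


if __name__ == "__main__" and torch.cuda.is_available():
    net = torch.nn.Linear(8, 8).cuda()
    inputs = {"input": torch.randn(4, 8, device="cuda")}
    net(**inputs)  # warm-up so lazy initialization does not inflate the count
    compiled = torch.compile(net)
    compiled(**inputs)  # trigger compilation outside the profiled region
    print(count_kernels(compiled, inputs))

Summing across both keys (rather than returning the count of the first matching event) matters because a compiled graph can mix driver-API and runtime-API launches in a single forward pass.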
