CountNumKernelsNNModule

lixinqi · lixinqi · commit 822fbf8259e0 · 2025-12-21T13:22:16.000Z
diff --git a/graph_net/optional.py b/graph_net/optional.py
@@ -0,0 +1,25 @@
+from typing import TypeVar, Generic, Union
+
+T = TypeVar("T")
+
+
+class Optional(Generic[T]):  # mutable holder for a value that may be absent (None means empty); defined here, not typing.Optional
+    def __init__(self, value: Union[T, None]):  # pass None to start out empty
+        self._value = value
+
+    def reset(self, that):  # overwrite this holder's content, in place, with the content of another Optional
+        assert isinstance(that, Optional)  # NOTE: assert statements are removed under `python -O`, so this check is debug-only
+        self._value = that._value  # stores the same object `that` holds; `that` itself is not modified
+
+    def is_some(self) -> bool:  # True when a non-None value is stored
+        return self._value is not None
+
+    def unwrap(self) -> T:
+        """Returns the value or raises an error if None."""
+        if self._value is None:
+            raise ValueError("Tried to unwrap a None value!")
+        return self._value
+
+    def unwrap_or(self, default: T) -> T:
+        """Returns the value or a default if None."""
+        return self._value if self._value is not None else default
diff --git a/graph_net/torch/count_kernels_util.py b/graph_net/torch/count_kernels_util.py
@@ -2,6 +2,7 @@
 from graph_net.torch import utils
 import importlib.util
 import torch
+from graph_net.optional import Optional
 import sys
 from typing import Type
 from torch.profiler import profile, record_function, ProfilerActivity
@@ -20,6 +21,21 @@ def __call__(self, module, sub_module_idx):
         return TorchNNModuleFullyFusiblePredicator(module)
 
 
+class CountNumKernelsNNModule(torch.nn.Module):  # runs `module` through torch.compile and reports the kernel count of each forward call
+    def __init__(self, module, mut_opt_num_kernels: Optional):
+        super().__init__()
+        self.module = module
+        self.compiled_module = torch.compile(self.module)
+        self.mut_opt_num_kernels = mut_opt_num_kernels  # caller-owned holder used as an out-parameter; forward() writes into it
+
+    def forward(self, *inputs):
+        ret_tensors, compiled_num_of_kernels = count_kernels(  # result is unpacked as (module outputs, kernel count)
+            self.compiled_module, inputs
+        )
+        self.mut_opt_num_kernels.reset(Optional(compiled_num_of_kernels))  # each call overwrites whatever count was stored before
+        return ret_tensors
+
+
 class TorchNNModuleFullyFusiblePredicator(torch.nn.Module):
     def __init__(self, module):
         super().__init__()
diff --git a/graph_net/torch/sample_passes/cumsum_num_kernels_generator.py b/graph_net/torch/sample_passes/cumsum_num_kernels_generator.py
@@ -1,10 +1,11 @@
 from graph_net.sample_pass.sample_pass import SamplePass
 from graph_net.sample_pass.resumable_sample_pass_mixin import ResumableSamplePassMixin
+from graph_net.optional import Optional
 from graph_net.torch.fx_graph_cache_util import (
     parse_immutable_model_path_into_sole_graph_module,
 )
 from graph_net.torch.decompose_util import convert_to_submodules_graph
-from graph_net.torch.count_kernels_util import count_kernels
+from graph_net.torch.count_kernels_util import CountNumKernelsNNModule
 from graph_net.torch.fx_graph_module_util import (
     get_fx_graph_num_ops,
     get_torch_module_and_inputs,
@@ -41,6 +42,7 @@ def resume(self, rel_model_path: str):
         cumsum_num_kernels = analyzer.analyze()
         cumsum_num_kernels_json = json.dumps(cumsum_num_kernels, indent=4)
         output_dir_path = Path(self.config["output_dir"]) / rel_model_path
+        output_dir_path.mkdir(parents=True, exist_ok=True)
         (output_dir_path / self.config["output_json_file_name"]).write_text(
             cumsum_num_kernels_json
         )
@@ -53,9 +55,9 @@ def __init__(self, model_path: Path):
     def analyze(self):
         triples = list(self._get_cumsum_num_kernels())
         data = {
-            "range_and_num_kernels": [
-                ((start, end), num_kernels) for start, end, num_kernels in triples
-            ],
+            "num_kernels": [num_kernels for start, end, num_kernels in triples],  # the three lists are index-aligned: entry i of each comes from triples[i]
+            "starts": [start for start, end, num_kernels in triples],
+            "ends": [end for start, end, num_kernels in triples],
         }
         return data
 
@@ -79,16 +81,22 @@ def _get_num_kernels_if_submodule_compiled(
         self, graph_module, nn_module, inputs, submodule_start, submodule_end
     ):
         torch.cuda.empty_cache()
+        mut_opt_num_kernels = Optional(None)
+
+        def compile_and_count_num_kernels(m, seq_no):  # passed below as submodule_hook; seq_no is accepted but not used
+            return CountNumKernelsNNModule(m, mut_opt_num_kernels)  # every wrapper created here writes into the same mut_opt_num_kernels holder
+
         rewrited_gm: torch.fx.GraphModule = convert_to_submodules_graph(
             graph_module,
-            submodule_hook=lambda m, seq_no: torch.compile(m),
+            submodule_hook=compile_and_count_num_kernels,
             split_positions=[submodule_start, submodule_end],
             subgraph_ranges=[(submodule_start, submodule_end)],
             group_head_and_tail=False,
             chain_style=False,
         )
-        _, num_kernels = count_kernels(rewrited_gm, inputs)
-        return num_kernels
+        rewrited_gm(*inputs)
+        assert mut_opt_num_kernels.is_some()
+        return mut_opt_num_kernels.unwrap()
 
     def _get_ranges(self, gm):
         num_ops = get_fx_graph_num_ops(gm)