
Commit 58cc669

BoyuanFeng authored and pytorchmergebot committed
[BE] Type annotate wrapper_benchmark.py and cuda_combined_scheduling.py (pytorch#145542)
Pull Request resolved: pytorch#145542
Approved by: https://github.com/eellison
1 parent 8cc6f17 commit 58cc669

4 files changed: +83 −37 lines changed

torch/_inductor/codegen/cuda_combined_scheduling.py

Lines changed: 23 additions & 10 deletions

@@ -1,7 +1,7 @@
 # mypy: allow-untyped-defs
 from __future__ import annotations

-from typing import Optional, TYPE_CHECKING, Union
+from typing import Any, List, Optional, Tuple, TYPE_CHECKING, Union

 from ..scheduler import (
     BaseSchedulerNode,
@@ -17,12 +17,17 @@

 if TYPE_CHECKING:
     from collections.abc import Sequence
+    from typing_extensions import TypeAlias
+
+    from sympy import Expr

     import torch
     from torch.utils._ordered_set import OrderedSet

     from .common import BackendFeature

+    _IntLike: TypeAlias = Union[int, Expr]
+

 class CUDACombinedScheduling(BaseScheduling):
     """
@@ -67,15 +72,17 @@ def can_fuse_horizontal(
         )  # always False at the moment
         return self._triton_scheduling.can_fuse_horizontal(node1, node2)

-    def group_fn(self, sizes):
+    def group_fn(
+        self, sizes: Sequence[Sequence[_IntLike]]
+    ) -> tuple[tuple[_IntLike, ...], ...]:
         return self._triton_scheduling.group_fn(sizes)

     def codegen_template(
         self,
         template_node: BaseSchedulerNode,
         epilogue_nodes: Sequence[BaseSchedulerNode],
         prologue_nodes: Sequence[BaseSchedulerNode],
-    ):
+    ) -> Optional[str]:
         if self._cuda_cpp_scheduling.is_cuda_cpp_template(template_node):
             assert not epilogue_nodes
             assert not prologue_nodes
@@ -93,28 +100,34 @@ def codegen_template(
             template_node, epilogue_nodes, prologue_nodes
         )

-    def codegen_node(self, node: Union[FusedSchedulerNode, SchedulerNode]):
+    def codegen_node(self, node: Union[FusedSchedulerNode, SchedulerNode]) -> None:
         return self._triton_scheduling.codegen_node(node)

-    def codegen_sync(self):
+    def codegen_sync(self) -> None:
         return self._triton_scheduling.codegen_sync()

-    def flush(self):
+    def flush(self) -> None:
         return self._triton_scheduling.flush()

-    def codegen_combo_kernel(self, *args, **kwargs):
+    def codegen_combo_kernel(self, *args: Any, **kwargs: Any) -> None:
         return self._triton_scheduling.codegen_combo_kernel(*args, **kwargs)

-    def benchmark_fused_nodes(self, nodes):
+    def benchmark_fused_nodes(
+        self, nodes: Sequence[BaseSchedulerNode]
+    ) -> Tuple[float, str]:
         return self._triton_scheduling.benchmark_fused_nodes(nodes)

     def benchmark_codegened_module(self, module):
         return self._triton_scheduling.benchmark_codegened_module(module)

-    def generate_kernel_code_from_nodes(self, nodes, benchmark_kernel=False):
+    def generate_kernel_code_from_nodes(
+        self, nodes: Sequence[Any], benchmark_kernel: bool = False
+    ) -> str:
         return self._triton_scheduling.generate_kernel_code_from_nodes(
             nodes, benchmark_kernel
         )

-    def benchmark_combo_kernel(self, node_list):
+    def benchmark_combo_kernel(
+        self, node_list: Sequence[BaseSchedulerNode]
+    ) -> tuple[float, float, List[Optional[str]]]:
         return self._triton_scheduling.benchmark_combo_kernel(node_list)
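Side note on the new `_IntLike` alias above: because the file already uses `from __future__ import annotations`, names imported only under `if TYPE_CHECKING:` (sympy's Expr, TypeAlias, and `_IntLike` itself) can appear in annotations such as `group_fn`'s without creating a runtime dependency. A minimal standalone sketch of that pattern, using hypothetical names rather than the Inductor classes:

from __future__ import annotations

from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    # These imports exist only for the type checker; they are skipped at runtime.
    from sympy import Expr
    from typing_extensions import TypeAlias

    _IntLike: TypeAlias = Union[int, Expr]


def double_size(size: _IntLike) -> _IntLike:
    # PEP 563 keeps annotations as strings, so _IntLike need not exist at runtime.
    return size * 2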

torch/_inductor/codegen/triton.py

Lines changed: 4 additions & 4 deletions

@@ -4137,8 +4137,8 @@ def add_multi_kernel_choices(

     def benchmark_combo_kernel(self, node_list):
         mod: ModuleType
-        ms: int
-        ms_clone: int
+        ms: float
+        ms_clone: float

         def cache_file_path():
             assert mod.__file__ is not None
@@ -4157,7 +4157,7 @@ def store_cache():
                 fd.write(str(ms) + " " + str(ms_clone))

         total_ms, file_list = 0, []
-        total_clone_ms = 0
+        total_clone_ms: float = 0.0
         removed_buffers_orig = V.graph.removed_buffers
         V.graph.removed_buffers = OrderedSet(removed_buffers_orig)
         inplaced_to_remove_orig = V.graph.inplaced_to_remove
@@ -4186,7 +4186,7 @@ def store_cache():
             )
             ms, ms_clone = load_cache()
             if ms is not None:
-                total_ms += ms
+                total_ms += ms  # type: ignore[assignment]
                 total_clone_ms += ms_clone
                 file_list.append(mod.__file__)
                 continue
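A note on why these annotations mostly satisfy mypy rather than change behavior: `total_ms, file_list = 0, []` makes mypy infer `total_ms` as int, while `ms` is now declared float, so `total_ms += ms` assigns a float into an int-typed variable (hence the remaining `# type: ignore[assignment]`); the explicit `total_clone_ms: float = 0.0` avoids the same complaint for the clone-time accumulator. A rough illustration of that inference rule with made-up names (plain mypy behavior, not the Inductor code):

def measure() -> float:
    return 1.5


total_ms = 0                   # mypy infers int from the literal
total_ms += measure()          # mypy: float assigned to variable of type "int"

total_clone_ms: float = 0.0    # explicit annotation keeps the accumulator a float
total_clone_ms += measure()    # accepted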

torch/_inductor/scheduler.py

Lines changed: 2 additions & 2 deletions

@@ -3988,7 +3988,7 @@ def _codegen(self) -> None:

     def benchmark_combo_kernel(
         self, node_list: Sequence[BaseSchedulerNode]
-    ) -> tuple[float, float, str]:
+    ) -> tuple[float, float, List[Optional[str]]]:
         """
         Benchmark fused list of nodes and return the execution time
         in milliseconds on randomly generated inputs.
@@ -4228,7 +4228,7 @@ def get_fusion_pair_priority(

     def benchmark_combo_kernel(
         self, node_list: Sequence[BaseSchedulerNode]
-    ) -> tuple[float, float, str]:
+    ) -> tuple[float, float, List[Optional[str]]]:
         """
         Benchmark the list of nodes to combine and return the execution time
         and memory copy time in milliseconds on randomly generated inputs.
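The corrected signature mirrors the Triton backend's benchmark_combo_kernel above: a total kernel time, a total clone time, and a list of generated-module file paths (per the annotation, individual entries may be None). A small hedged sketch of consuming such a value, with illustrative names only:

from typing import List, Optional


def summarize(result: tuple[float, float, List[Optional[str]]]) -> None:
    total_ms, total_clone_ms, file_list = result
    for path in file_list:
        if path is not None:
            print(f"kernel module: {path} ({total_ms:.3f} ms total)")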

torch/_inductor/wrapper_benchmark.py

Lines changed: 54 additions & 21 deletions

@@ -1,8 +1,9 @@
-# mypy: allow-untyped-defs
 import dataclasses
 import datetime
 import tempfile
 from collections import defaultdict
+from types import ModuleType
+from typing import Any, Dict, Optional, Protocol

 import torch
 from torch.autograd import DeviceType
@@ -12,6 +13,11 @@
 from .runtime.runtime_utils import create_bandwidth_info_str, get_num_bytes


+class BenchmarkCallableType(Protocol):
+    def __call__(self, times: int, repeat: int) -> float:
+        ...
+
+
 _kernel_category_choices = [
     "foreach",
     "persistent_reduction",
@@ -22,7 +28,7 @@
 ]


-def get_kernel_category_by_source_code(src_code):
+def get_kernel_category_by_source_code(src_code: str) -> str:
     """
     Similar to get_kernel_category but use the source code. Call this API
     if we have not compile the src_code to module yet.
@@ -36,7 +42,7 @@ def get_kernel_category_by_source_code(src_code):
     return "unknown"


-def get_kernel_category(kernel_mod):
+def get_kernel_category(kernel_mod: ModuleType) -> str:
     """
     Given the module defining a triton kernel, return the category of the kernel.
     Category can be one of:
@@ -54,7 +60,7 @@ def get_kernel_category(kernel_mod):
     return "unknown"


-def get_triton_kernel(mod):
+def get_triton_kernel(mod: ModuleType):  # type: ignore[no-untyped-def]
     from torch._inductor.runtime.triton_heuristics import CachingAutotuner

     cand_list = [
@@ -66,7 +72,9 @@ def get_triton_kernel(mod):
     return cand_list[0]


-def benchmark_all_kernels(benchmark_name, benchmark_all_configs):
+def benchmark_all_kernels(
+    benchmark_name: str, benchmark_all_configs: Optional[Dict[Any, Any]]
+) -> None:
     """
     An experimental API used only when config.benchmark_kernel is true.

@@ -98,7 +106,13 @@ def benchmark_all_kernels(benchmark_name, benchmark_all_configs):
         if num_gb is None:
             num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9

-        def get_info_str(ms, n_regs, n_spills, shared, prefix=""):
+        def get_info_str(
+            ms: float,
+            n_regs: Optional[Any],
+            n_spills: Optional[Any],
+            shared: Optional[Any],
+            prefix: str = "",
+        ) -> str:
             if not any(x is None for x in [n_regs, n_spills, shared]):
                 kernel_detail_str = (
                     f" {n_regs:3} regs {n_spills:3} spills {shared:8} shared mem"
@@ -156,22 +170,31 @@ class ProfileEvent:


 def parse_profile_event_list(
-    benchmark_name, event_list, wall_time_ms, nruns, device_name
-):
-    def get_self_device_time(ev):
+    benchmark_name: str,
+    event_list: torch.autograd.profiler_util.EventList,
+    wall_time_ms: float,
+    nruns: int,
+    device_name: str,
+) -> None:
+    def get_self_device_time(
+        ev: torch.autograd.profiler_util.EventList,
+    ) -> float:
         """
         ev.self_device_time_total is in microsecond. Convert to millisecond.
         """
-        return ev.self_device_time_total / 1000 / nruns
+        return ev.self_device_time_total / 1000 / nruns  # type: ignore[attr-defined]

-    all_events = defaultdict(list)
+    all_events: Dict[str, list[ProfileEvent]] = defaultdict(list)

-    def add_event(ev, category):
+    def add_event(
+        ev: torch.autograd.profiler_util.EventList,
+        category: str,
+    ) -> None:
         profile_ev = ProfileEvent(
             category=category,
-            key=ev.key,
+            key=ev.key,  # type: ignore[attr-defined]
             self_device_time_ms=get_self_device_time(ev),
-            count=ev.count / nruns,  # average across all runs
+            count=ev.count / nruns,  # type: ignore[operator] # average across all runs
         )
         all_events[category].append(profile_ev)

@@ -194,7 +217,7 @@ def add_event(ev, category):

         add_event(ev, category)

-    def report_category(category, profile_events):
+    def report_category(category: str, profile_events: list[ProfileEvent]) -> float:
         if not device_name:
             return 0.0

@@ -225,7 +248,7 @@ def report_category(category, profile_events):
         )
         return total_time

-    def report():
+    def report() -> None:
         category_list = [
             "triton_pointwise",
             "triton_reduction",
@@ -273,8 +296,12 @@ def report():


 def perf_profile(
-    wall_time_ms, times, repeat, benchmark_name, benchmark_compiled_module_fn
-):
+    wall_time_ms: float,
+    times: int,
+    repeat: int,
+    benchmark_name: str,
+    benchmark_compiled_module_fn: BenchmarkCallableType,
+) -> None:
     with torch.profiler.profile(record_shapes=True) as p:
         benchmark_compiled_module_fn(times=times, repeat=repeat)

@@ -289,7 +316,9 @@ def perf_profile(
     )


-def ncu_analyzer(benchmark_name, benchmark_compiled_module_fn):
+def ncu_analyzer(
+    benchmark_name: str, benchmark_compiled_module_fn: BenchmarkCallableType
+) -> None:
     import inspect
     import os
     import subprocess
@@ -339,7 +368,9 @@ def ncu_analyzer(benchmark_name, benchmark_compiled_module_fn):
     return


-def collect_memory_snapshot(benchmark_compiled_module_fn):
+def collect_memory_snapshot(
+    benchmark_compiled_module_fn: BenchmarkCallableType,
+) -> None:
     assert torch.cuda.is_available()

     torch.cuda.memory._record_memory_history(max_entries=100000)
@@ -350,7 +381,9 @@ def collect_memory_snapshot(benchmark_compiled_module_fn):
     print(f"The collect memory snapshot has been written to {snapshot_path}")


-def compiled_module_main(benchmark_name, benchmark_compiled_module_fn):
+def compiled_module_main(
+    benchmark_name: str, benchmark_compiled_module_fn: BenchmarkCallableType
+) -> None:
     """
     This is the function called in __main__ block of a compiled module.
     """
