Merged (21 commits)
- `2249bef` start metric profile branch (simonguozirui, Dec 16, 2025)
- `4716be5` Merge branch 'main' of github-simon:ScalingIntelligence/KernelBench i… (simonguozirui, Dec 16, 2025)
- `845dcbb` clean up remaining places for cuda events (simonguozirui, Dec 16, 2025)
- `b3387a4` add in nsight-python metric evaluation script and example usage, have… (kesavanramakrishnan, Dec 17, 2025)
- `1dd72d7` Merge branch 'main' of https://github.com/ScalingIntelligence/KernelB… (kesavanramakrishnan, Jan 6, 2026)
- `9d36c26` merge main (kesavanramakrishnan, Jan 6, 2026)
- `daa08cb` add in profiling with nsight python capability, as well as code path … (kesavanramakrishnan, Jan 7, 2026)
- `24cc9fc` add in profiling via nsight python (kesavanramakrishnan, Jan 7, 2026)
- `ce4469d` move profile.py (kesavanramakrishnan, Jan 7, 2026)
- `decf8c6` fixing profiling and timing (kesavanramakrishnan, Jan 7, 2026)
- `0de4ce8` Merge branch 'main' of https://github.com/ScalingIntelligence/KernelB… (kesavanramakrishnan, Jan 7, 2026)
- `6a16b50` merge in main (kesavanramakrishnan, Jan 7, 2026)
- `3b4b342` clean up comments (kesavanramakrishnan, Jan 7, 2026)
- `14658ab` reference nsight python (kesavanramakrishnan, Jan 7, 2026)
- `e6729ab` clean up (kesavanramakrishnan, Jan 7, 2026)
- `7b01845` put wheel pkd data fix in this PR (simonguozirui, Jan 7, 2026)
- `977ecb6` update to new nsight version (kesavanramakrishnan, Jan 8, 2026)
- `8d9461d` for kesavan to further fix (simonguozirui, Jan 8, 2026)
- `a9a2932` clean up profile and make verbose (kesavanramakrishnan, Jan 8, 2026)
- `cd64156` add in tests to test_eval_timing (kesavanramakrishnan, Jan 8, 2026)
- `95468c3` get ready for merge (simonguozirui, Jan 8, 2026)
3 changes: 3 additions & 0 deletions EVAL.md
@@ -36,6 +36,9 @@ We have implemented (and continue to implement) various approaches to kernel timing

Check out `timing.py` for the available timing methods and `src/unit_tests/test_eval_timing.py` to exercise them (including the `cuda_event` marker, Triton `do_bench`, and `host_time` end-to-end timing). @palic and team are working on a blog post explaining the tradeoffs soon.
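
As a minimal sketch of how a timing method is dispatched and run, grounded in the call sites this PR updates (the `num_warmup` and `discard_first` values mirror those call sites; the exact defaults and the full set of method names may differ):

```python
import torch
import torch.nn as nn

from kernelbench.timing import get_timing_function, get_timing_stats

device = torch.device("cuda:0")
model = nn.Linear(1024, 1024).to(device)         # stand-in for a KernelBench model
inputs = [torch.randn(64, 1024, device=device)]  # inputs are passed as a list

# Look up a timing backend by name; "cuda_event" is the method used in this PR.
timing_func = get_timing_function("cuda_event")

torch.cuda.synchronize(device=device)
elapsed_times = timing_func(
    model,
    inputs,
    num_warmup=3,     # warmup iterations before measurement
    num_trials=100,   # measured iterations
    discard_first=1,  # drop the first measured trial
    verbose=False,
    device=device,
)
print(get_timing_stats(elapsed_times, device=device))
```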

### Profiling
We have experimental profiling support in `profile.py`, leveraging NVIDIA Nsight Compute (NCU) via `nsight-python`.
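
The exact interface of `profile.py` is not shown in this diff; purely as an illustration of the NCU workflow, a Python workload can be wrapped with the Nsight Compute CLI (the target script and report name below are placeholders, not files from this PR):

```python
import subprocess

# Illustration only: run a Python workload under NVIDIA Nsight Compute,
# collecting the full metric set and writing a .ncu-rep report.
subprocess.run(
    [
        "ncu",
        "--set", "full",             # collect the full metric set
        "-o", "kernelbench_report",  # writes kernelbench_report.ncu-rep
        "python", "scripts/run_and_check.py",  # placeholder target script
    ],
    check=True,
)
```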

### Checkers
There are potentially many ways a model might reward hack, and we would like to catch the known ones through checkers [experimental and WIP]. We start with `kernel_static_checker.py`, a regex-based checker that validates generated code against a set of rules. We plan to add AST-based, LM-as-a-judge, and more runtime checks in the future. We welcome suggestions and contributions here.
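
For intuition only, a regex-based rule check might look like the sketch below; the patterns here are invented for illustration and are not the actual rules in `kernel_static_checker.py`:

```python
import re

# Hypothetical rules: regex -> reason the match is suspicious in generated code.
RULES = {
    r"torch\.cuda\.Event": "may manipulate timing by creating CUDA events directly",
    r"time\.sleep": "artificial delays can game relative speedup measurements",
}

def static_check(kernel_src: str) -> list[str]:
    """Return the reason for every rule the generated kernel source violates."""
    return [reason for pattern, reason in RULES.items() if re.search(pattern, kernel_src)]

print(static_check("start = torch.cuda.Event(enable_timing=True)"))
# -> ['may manipulate timing by creating CUDA events directly']
```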

4 changes: 3 additions & 1 deletion README.md
@@ -66,13 +66,15 @@ We organize the repo into the following structure:
KernelBench/
├── assets/
├── KernelBench/ # Benchmark dataset files
├── src/ # KernelBench logic code
├── src/kernelbench/ # KernelBench logic code
│ ├── unit_tests/
│ ├── prompts/
│ ├── ....
├── scripts/ # helpful scripts to run the benchmark
├── results/ # baseline times across hardware
├── runs/ # where your runs will be stored
├── notebooks/ # example notebooks for analysis
├── pyproject.toml # Project configuration and dependencies
```

## 🔧 Set up
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -42,6 +42,7 @@ gpu = [
"nvidia-cutlass-dsl",
"tilelang",
"cupy-cuda12x",
"nsight-python",
]
dev = [
"pytest",
@@ -51,4 +52,7 @@ dev = [

[tool.setuptools.packages.find]
where = ["src"]
include = ["kernelbench*"]

[tool.setuptools.package-data]
kernelbench = ["prompts/**/*"]
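
With `prompts/**` declared as package data, prompt templates ship inside the built wheel and can be loaded from the installed package. A sketch of that usage (the `prompts/example.txt` file name is a placeholder, not a real file from this PR):

```python
from importlib import resources

# Read a prompt template bundled as kernelbench package data.
template = (resources.files("kernelbench") / "prompts" / "example.txt").read_text()
print(template[:80])
```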
1 change: 1 addition & 0 deletions requirements.txt
@@ -23,6 +23,7 @@ ninja>=1.13.0
cupy-cuda12x==13.6.0
tomli>=2.3.0
tabulate>=0.9.0
nsight-python

# Numerics
einops>=0.8.1
15 changes: 12 additions & 3 deletions scripts/generate_baseline_time_modal.py
@@ -6,7 +6,7 @@
fetch_ref_arch_from_problem_id,
)
from kernelbench.timing import (
time_execution_with_cuda_event,
get_timing_function,
get_timing_stats,
)
from kernelbench.dataset import construct_kernelbench_dataset, fetch_ref_arch_from_dataset
@@ -134,6 +134,7 @@ def measure_program_time(
ref_arch_name: str,
ref_arch_src: str,
num_trials: int = 100,
timing_method: str="cuda_event",
use_torch_compile: bool = False,
torch_compile_backend: str="inductor",
torch_compile_options: str="default",
@@ -173,9 +174,16 @@
print(f"Using PyTorch Eager Execution on {ref_arch_name}")

model = model.cuda(device=device)
timing_func = get_timing_function(timing_method)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, inputs, num_trials=num_trials, verbose=verbose, device=device
elapsed_times = timing_func(
model,
inputs,
num_warmup=3, # or any default you prefer
num_trials=num_trials,
discard_first=1, # or 0 to include first trial
verbose=verbose,
device=device,
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

@@ -220,6 +228,7 @@ def record_baseline_times(config: BaselineConfig,
ref_arch_name=ref_arch_name,
ref_arch_src=ref_arch_src,
num_trials=config.num_trials,
timing_method="cuda_event",
use_torch_compile=use_torch_compile,
torch_compile_backend=torch_compile_backend,
torch_compile_options=torch_compile_options,
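A hedged sketch of calling the updated `measure_program_time` with the new `timing_method` argument, mirroring the pattern at the bottom of `get_baseline_time_single_problem.py` (the reference model below is a stand-in, not a dataset problem; assumes the script's own imports):

```python
# Stand-in KernelBench reference architecture: class Model plus input helpers.
ref_arch_src = '''
import torch
import torch.nn as nn

class Model(nn.Module):
    def forward(self, x):
        return torch.relu(x)

def get_inputs():
    return [torch.randn(1024, 1024).cuda()]

def get_init_inputs():
    return []
'''

stats = measure_program_time(
    ref_arch_name="relu_example",
    ref_arch_src=ref_arch_src,
    num_trials=100,
    timing_method="cuda_event",  # timing backend name, as used elsewhere in this PR
    use_torch_compile=False,
)
print(stats)
```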
13 changes: 7 additions & 6 deletions scripts/get_baseline_time_single_problem.py
@@ -2,16 +2,17 @@
import numpy as np
from kernelbench.eval import (
load_original_model_and_inputs,
time_execution_with_cuda_event,
get_timing_stats,
set_seed,
fetch_ref_arch_from_problem_id,
)

from kernelbench.timing import get_timing_function, get_timing_stats

def measure_program_time(
ref_arch_name: str,
ref_arch_src: str,
num_trials: int = 100,
timing_method: str="cuda_event",
use_torch_compile: bool = False,
torch_compile_backend: str="inductor",
torch_compile_options: str="default",
@@ -52,8 +53,9 @@ def measure_program_time(

model = model.cuda(device=device)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, *inputs, num_trials=num_trials, verbose=verbose, device=device
timing_func = get_timing_function(timing_method)
elapsed_times = timing_func(
model, inputs, num_warmup=3, num_trials=num_trials, discard_first=1, verbose=verbose, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

@@ -87,5 +89,4 @@ def get_inputs():
def get_init_inputs():
return [] # No special initialization inputs needed
"""
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False))
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=True))
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event"))
4 changes: 3 additions & 1 deletion scripts/inspect_triton.py
@@ -161,7 +161,9 @@ def get_torch_compile_triton(level_num: int, problem_id: int) -> str:


torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
timing_method = "cuda_event"  # use CUDA events for timing here
time_func_cuda_event = get_timing_function(timing_method)
elapsed_times = time_func_cuda_event(
model, inputs, num_trials=1, verbose=False, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)