Merged (21 commits)
- `2249bef` start metric profile branch (simonguozirui, Dec 16, 2025)
- `4716be5` Merge branch 'main' of github-simon:ScalingIntelligence/KernelBench i… (simonguozirui, Dec 16, 2025)
- `845dcbb` clean up remaining places for cuda events (simonguozirui, Dec 16, 2025)
- `b3387a4` add in nsight-python metric evaluation script and example usage, have… (kesavanramakrishnan, Dec 17, 2025)
- `1dd72d7` Merge branch 'main' of https://github.com/ScalingIntelligence/KernelB… (kesavanramakrishnan, Jan 6, 2026)
- `9d36c26` merge main (kesavanramakrishnan, Jan 6, 2026)
- `daa08cb` add in profiling with nsight python capability, as well as code path … (kesavanramakrishnan, Jan 7, 2026)
- `24cc9fc` add in profiling via nsight python (kesavanramakrishnan, Jan 7, 2026)
- `ce4469d` move profile.py (kesavanramakrishnan, Jan 7, 2026)
- `decf8c6` fixing profiling and timing (kesavanramakrishnan, Jan 7, 2026)
- `0de4ce8` Merge branch 'main' of https://github.com/ScalingIntelligence/KernelB… (kesavanramakrishnan, Jan 7, 2026)
- `6a16b50` merge in main (kesavanramakrishnan, Jan 7, 2026)
- `3b4b342` clean up comments (kesavanramakrishnan, Jan 7, 2026)
- `14658ab` reference nsight python (kesavanramakrishnan, Jan 7, 2026)
- `e6729ab` clean up (kesavanramakrishnan, Jan 7, 2026)
- `7b01845` put wheel pkd data fix in this PR (simonguozirui, Jan 7, 2026)
- `977ecb6` update to new nsight version (kesavanramakrishnan, Jan 8, 2026)
- `8d9461d` for kesavan to further fix (simonguozirui, Jan 8, 2026)
- `a9a2932` clean up profile and make verbose (kesavanramakrishnan, Jan 8, 2026)
- `cd64156` add in tests to test_eval_timing (kesavanramakrishnan, Jan 8, 2026)
- `95468c3` get ready for merge (simonguozirui, Jan 8, 2026)
3 changes: 3 additions & 0 deletions EVAL.md
@@ -36,6 +36,9 @@ We have implemented (and continue to implement) various approaches to kernel timing

Check out `timing.py` for the available timing methods and `src/unit_tests/test_eval_timing.py` to exercise them (including the `cuda_event` marker, Triton `do_bench`, and `host_time` end-to-end timing). @palic and team are working on a blog post explaining the tradeoffs soon.
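
As a minimal sketch of how a timing method is dispatched and run, grounded in the call sites this PR updates (the `num_warmup` and `discard_first` values mirror those call sites; the exact defaults and the full set of method names may differ):

```python
import torch
import torch.nn as nn

from kernelbench.timing import get_timing_function, get_timing_stats

device = torch.device("cuda:0")
model = nn.Linear(1024, 1024).to(device)         # stand-in for a KernelBench model
inputs = [torch.randn(64, 1024, device=device)]  # inputs are passed as a list

# Look up a timing backend by name; "cuda_event" is the method used in this PR.
timing_func = get_timing_function("cuda_event")

torch.cuda.synchronize(device=device)
elapsed_times = timing_func(
    model,
    inputs,
    num_warmup=3,     # warmup iterations before measurement
    num_trials=100,   # measured iterations
    discard_first=1,  # drop the first measured trial
    verbose=False,
    device=device,
)
print(get_timing_stats(elapsed_times, device=device))
```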

### Profiling
We have experimental profiling support in `profile.py`, leveraging NVIDIA Nsight Compute (NCU) via `nsight-python`.
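
The exact interface of `profile.py` is not shown in this diff; purely as an illustration of the NCU workflow, a Python workload can be wrapped with the Nsight Compute CLI (the target script and report name below are placeholders, not files from this PR):

```python
import subprocess

# Illustration only: run a Python workload under NVIDIA Nsight Compute,
# collecting the full metric set and writing a .ncu-rep report.
subprocess.run(
    [
        "ncu",
        "--set", "full",             # collect the full metric set
        "-o", "kernelbench_report",  # writes kernelbench_report.ncu-rep
        "python", "scripts/run_and_check.py",  # placeholder target script
    ],
    check=True,
)
```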

### Checkers
There are potentially many ways a model might reward hack, and we would like to catch the known ones through checkers [experimental and WIP]. We start with `kernel_static_checker.py`, a regex-based checker that validates generated code against a set of rules. We plan to add AST-based, LM-as-a-judge, and more runtime checks in the future. We welcome suggestions and contributions here.
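
For intuition only, a regex-based rule check might look like the sketch below; the patterns here are invented for illustration and are not the actual rules in `kernel_static_checker.py`:

```python
import re

# Hypothetical rules: regex -> reason the match is suspicious in generated code.
RULES = {
    r"torch\.cuda\.Event": "may manipulate timing by creating CUDA events directly",
    r"time\.sleep": "artificial delays can game relative speedup measurements",
}

def static_check(kernel_src: str) -> list[str]:
    """Return the reason for every rule the generated kernel source violates."""
    return [reason for pattern, reason in RULES.items() if re.search(pattern, kernel_src)]

print(static_check("start = torch.cuda.Event(enable_timing=True)"))
# -> ['may manipulate timing by creating CUDA events directly']
```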

4 changes: 3 additions & 1 deletion README.md
@@ -66,13 +66,15 @@ We organize the repo into the following structure:
KernelBench/
├── assets/
├── KernelBench/ # Benchmark dataset files
├── src/ # KernelBench logic code
├── src/kernelbench/ # KernelBench logic code
│ ├── unit_tests/
│ ├── prompts/
│ ├── ....
├── scripts/ # helpful scripts to run the benchmark
├── results/ # baseline times across hardware
├── runs/ # where your runs will be stored
├── notebooks/ # example notebooks for analysis
├── pyproject.toml # Project configuration and dependencies
```

## 🔧 Set up
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -42,6 +42,7 @@ gpu = [
"nvidia-cutlass-dsl",
"tilelang",
"cupy-cuda12x",
"nsight-python",
]
dev = [
"pytest",
@@ -51,4 +52,7 @@ dev = [

[tool.setuptools.packages.find]
where = ["src"]
include = ["kernelbench*"]

[tool.setuptools.package-data]
kernelbench = ["prompts/**/*"]
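
With `prompts/**` declared as package data, prompt templates ship inside the built wheel and can be loaded from the installed package. A sketch of that usage (the `prompts/example.txt` file name is a placeholder, not a real file from this PR):

```python
from importlib import resources

# Read a prompt template bundled as kernelbench package data.
template = (resources.files("kernelbench") / "prompts" / "example.txt").read_text()
print(template[:80])
```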
1 change: 1 addition & 0 deletions requirements.txt
@@ -23,6 +23,7 @@ ninja>=1.13.0
cupy-cuda12x==13.6.0
tomli>=2.3.0
tabulate>=0.9.0
nsight-python

# Numerics
einops>=0.8.1
15 changes: 12 additions & 3 deletions scripts/generate_baseline_time_modal.py
@@ -6,7 +6,7 @@
fetch_ref_arch_from_problem_id,
)
from kernelbench.timing import (
time_execution_with_cuda_event,
get_timing_function,
get_timing_stats,
)
from kernelbench.dataset import construct_kernelbench_dataset, fetch_ref_arch_from_dataset
@@ -134,6 +134,7 @@ def measure_program_time(
ref_arch_name: str,
ref_arch_src: str,
num_trials: int = 100,
timing_method: str="cuda_event",
use_torch_compile: bool = False,
torch_compile_backend: str="inductor",
torch_compile_options: str="default",
@@ -173,9 +174,16 @@
print(f"Using PyTorch Eager Execution on {ref_arch_name}")

model = model.cuda(device=device)
timing_func = get_timing_function(timing_method)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, inputs, num_trials=num_trials, verbose=verbose, device=device
elapsed_times = timing_func(
model,
inputs,
num_warmup=3, # or any default you prefer
num_trials=num_trials,
discard_first=1, # or 0 to include first trial
verbose=verbose,
device=device,
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

@@ -220,6 +228,7 @@ def record_baseline_times(config: BaselineConfig,
ref_arch_name=ref_arch_name,
ref_arch_src=ref_arch_src,
num_trials=config.num_trials,
timing_method="cuda_event",
use_torch_compile=use_torch_compile,
torch_compile_backend=torch_compile_backend,
torch_compile_options=torch_compile_options,
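A hedged sketch of calling the updated `measure_program_time` with the new `timing_method` argument, mirroring the pattern at the bottom of `get_baseline_time_single_problem.py` (the reference model below is a stand-in, not a dataset problem; assumes the script's own imports):

```python
# Stand-in KernelBench reference architecture: class Model plus input helpers.
ref_arch_src = '''
import torch
import torch.nn as nn

class Model(nn.Module):
    def forward(self, x):
        return torch.relu(x)

def get_inputs():
    return [torch.randn(1024, 1024).cuda()]

def get_init_inputs():
    return []
'''

stats = measure_program_time(
    ref_arch_name="relu_example",
    ref_arch_src=ref_arch_src,
    num_trials=100,
    timing_method="cuda_event",  # timing backend name, as used elsewhere in this PR
    use_torch_compile=False,
)
print(stats)
```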
13 changes: 7 additions & 6 deletions scripts/get_baseline_time_single_problem.py
@@ -2,16 +2,17 @@
import numpy as np
from kernelbench.eval import (
load_original_model_and_inputs,
time_execution_with_cuda_event,
get_timing_stats,
set_seed,
fetch_ref_arch_from_problem_id,
)

from kernelbench.timing import get_timing_function, get_timing_stats

def measure_program_time(
ref_arch_name: str,
ref_arch_src: str,
num_trials: int = 100,
timing_method: str="cuda_event",
use_torch_compile: bool = False,
torch_compile_backend: str="inductor",
torch_compile_options: str="default",
@@ -52,8 +53,9 @@ def measure_program_time(

model = model.cuda(device=device)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, *inputs, num_trials=num_trials, verbose=verbose, device=device
timing_func = get_timing_function(timing_method)
elapsed_times = timing_func(
model, inputs, num_warmup=3, num_trials=num_trials, discard_first=1, verbose=verbose, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

@@ -87,5 +89,4 @@ def get_inputs():
def get_init_inputs():
return [] # No special initialization inputs needed
"""
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False))
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=True))
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event"))
4 changes: 3 additions & 1 deletion scripts/inspect_triton.py
@@ -161,7 +161,9 @@ def get_torch_compile_triton(level_num: int, problem_id: int) -> str:


torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
timing_method = "cuda_event"  # use CUDA events for timing here
time_func_cuda_event = get_timing_function(timing_method)
elapsed_times = time_func_cuda_event(
model, inputs, num_trials=1, verbose=False, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)