
Commit 8727491

[Benchmark] Enable CSV output; clean up benchmark hot path (#398)

1 parent 45e9600 commit 8727491
File tree

1 file changed (+20 -26 lines)

benchmarks/run.py

Lines changed: 20 additions & 26 deletions
@@ -11,6 +11,9 @@
 $ python benchmarks/run.py --metrics speedup,accuracy --kernel vector_add  # Runs vector_add kernel
 $ python benchmarks/run.py --metrics speedup,accuracy --kernel vector_add,rms_norm  # Runs multiple kernels
 $ python benchmarks/run.py --metrics speedup,accuracy  # Runs all kernels
+
+# On GPU-1, run first 1/4 of inputs for all kernels and save results to CSV in the current directory
+$ CUDA_VISIBLE_DEVICES=1 python benchmarks/run.py --input-shard 1/4 --metrics accuracy,tflops,gbps,speedup --csv --output-dir ./
 """

 from __future__ import annotations
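
The new usage line relies on an --input-shard M/N flag. As a minimal sketch (the helper name and validation below are assumptions for illustration, not code from this commit), such a spec can be parsed into a (shard_idx, total_shards) pair:

# Hypothetical helper, not part of this commit: parse "1/4"-style
# --input-shard specs into (shard_idx, total_shards).
def parse_input_shard(spec: str) -> tuple[int, int]:
    shard_str, total_str = spec.split("/")
    shard_idx, total_shards = int(shard_str), int(total_str)
    if not 1 <= shard_idx <= total_shards:
        raise ValueError(f"shard index must be in [1, {total_shards}], got {shard_idx}")
    return shard_idx, total_shards

assert parse_input_shard("1/4") == (1, 4)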
@@ -353,17 +356,13 @@ def helion_method(
         attr = getattr(mod, attr_name)
         if isinstance(attr, Kernel):
             attr.reset()
+            # Force autotuning unless HELION_USE_DEFAULT_CONFIG=1 is set
+            # This ensures we run autotuning even if the kernel has pre-specified configs
+            if os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") != "1":
+                attr.settings.force_autotune = True

     def _inner() -> Callable[..., Any] | object:
-        # Force autotuning unless HELION_USE_DEFAULT_CONFIG=1 is set
-        # This ensures we run autotuning even if the kernel has pre-specified configs
-        if os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") != "1":
-            # Find all Kernel objects in the module and force autotuning
-            for attr_name in dir(mod):
-                attr = getattr(mod, attr_name)
-                if isinstance(attr, Kernel):
-                    attr.settings.force_autotune = True
-
+        # BENCHMARK HOT PATH, do not add any new logic here
         result = kfunc(*args)
         if callable(result):
             return result()
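
The point of this hunk is to hoist one-time setup out of the closure that the harness times, so the measured call contains only the kernel invocation. Below is a minimal sketch of that pattern; FakeKernel and make_benchmark_callable are stand-in names, not from this diff:

import os
from typing import Any, Callable


class FakeKernel:
    # Stand-in for helion's Kernel, only to make the sketch runnable.
    force_autotune = False


def make_benchmark_callable(
    kernels: list[FakeKernel], fn: Callable[..., Any], *args: Any
) -> Callable[[], Any]:
    # One-time setup happens here, before the timed closure is built,
    # mirroring the diff's move of the autotune logic out of _inner().
    if os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") != "1":
        for kernel in kernels:
            kernel.force_autotune = True

    def _inner() -> Any:
        # BENCHMARK HOT PATH, do not add any new logic here
        return fn(*args)

    return _inner


_inner = make_benchmark_callable([FakeKernel()], sum, range(1_000))
_inner()  # only this call should sit inside the harness's timing loop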
@@ -401,15 +400,16 @@ def _inner() -> Callable[..., Any] | object:
             file=sys.stderr,
         )

-    # Create and run the operator with unknown args
-    op = Operator(tb_args=tb_args, extra_args=unknown_args)
+    from tritonbench.run import _run

     # Handle input sharding if requested
     if input_shard_info:
         shard_idx, total_shards = input_shard_info

         # Get the actual number of inputs for this operator
-        total_inputs = op._available_num_inputs
+        total_inputs = Operator(
+            tb_args=tb_args, extra_args=unknown_args
+        )._available_num_inputs

         # Calculate shard boundaries
         inputs_per_shard = total_inputs // total_shards
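
The context line inputs_per_shard = total_inputs // total_shards implies shard-boundary arithmetic the hunk does not fully show. The sketch below assumes the common convention of giving the first total_inputs % total_shards shards one extra input; treat it as an illustration, not the file's exact code:

def shard_bounds(total_inputs: int, shard_idx: int, total_shards: int) -> tuple[int, int]:
    # Return (start_idx, shard_size) for a 1-based shard index,
    # distributing the remainder across the leading shards.
    inputs_per_shard = total_inputs // total_shards
    extra = total_inputs % total_shards
    if shard_idx <= extra:
        start_idx = (shard_idx - 1) * (inputs_per_shard + 1)
        shard_size = inputs_per_shard + 1
    else:
        start_idx = (
            extra * (inputs_per_shard + 1)
            + (shard_idx - 1 - extra) * inputs_per_shard
        )
        shard_size = inputs_per_shard
    return start_idx, shard_size

# 10 inputs over 4 shards -> sizes 3, 3, 2, 2
assert [shard_bounds(10, i, 4) for i in (1, 2, 3, 4)] == [(0, 3), (3, 3), (6, 2), (8, 2)]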
@@ -425,27 +425,21 @@ def _inner() -> Callable[..., Any] | object:
             )
             shard_size = inputs_per_shard

-        # Override the operator's input range
-        op._input_id = start_idx
-        op._num_inputs = shard_size
-
         print(
             f"Running input shard {shard_idx}/{total_shards}: inputs {start_idx} to {start_idx + shard_size - 1} (of {total_inputs} total)",
             file=sys.stderr,
         )

-        # Run with proper parameters
-        warmup = int(getattr(tb_args, "warmup", 25))
-        rep = int(getattr(tb_args, "iter", 100))
-        op.run(warmup=warmup, rep=rep)
+        # Add input-id and num-inputs to the tritonbench args before re-parsing
+        tritonbench_args.extend(
+            ["--input-id", str(start_idx), "--num-inputs", str(shard_size)]
+        )

-    # Print results
-    print("\nBenchmark Results:", file=sys.stderr)
-    print(op.output, file=sys.stderr)
+        # Re-parse args with the new input range
+        tb_args, unknown_args = tb_parser.parse_known_args(tritonbench_args)

-    # Clean up memory after running the kernel
-    # Delete the operator instance which contains all allocated tensors
-    del op
+    # Use tritonbench's _run function which handles arg processing
+    _run(tb_args, unknown_args)

     # Force garbage collection multiple times to ensure memory is freed
     for _ in range(3):
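
Rather than constructing an Operator and poking its private fields, the sharded path now feeds the computed range back through the argument parser. A self-contained sketch of that extend-and-re-parse pattern follows; the parser here is a stand-in, since tritonbench's real parser defines many more flags:

import argparse

tb_parser = argparse.ArgumentParser()
tb_parser.add_argument("--input-id", type=int, default=0)
tb_parser.add_argument("--num-inputs", type=int, default=None)

tritonbench_args = ["--input-id", "0"]  # args as originally passed

# Append the shard-derived range; argparse keeps the last occurrence
# of a repeated flag, so downstream code sees the sharded values.
start_idx, shard_size = 6, 2
tritonbench_args.extend(
    ["--input-id", str(start_idx), "--num-inputs", str(shard_size)]
)
tb_args, unknown_args = tb_parser.parse_known_args(tritonbench_args)
assert (tb_args.input_id, tb_args.num_inputs) == (6, 2)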
