$ python benchmarks/run.py --metrics speedup,accuracy --kernel vector_add # Runs vector_add kernel
$ python benchmarks/run.py --metrics speedup,accuracy --kernel vector_add,rms_norm # Runs multiple kernels
$ python benchmarks/run.py --metrics speedup,accuracy # Runs all kernels
+
+ # On GPU 1, run the first 1/4 of inputs for all kernels and save results to CSV in the current directory
+ $ CUDA_VISIBLE_DEVICES=1 python benchmarks/run.py --input-shard 1/4 --metrics accuracy,tflops,gbps,speedup --csv --output-dir ./
"""

from __future__ import annotations
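For readers new to the flag, `--input-shard N/M` means "run the N-th of M slices of the operator's inputs". A minimal sketch of how such a spec can be split into a shard index and shard count; the helper name `parse_input_shard` is hypothetical, not part of this PR:

def parse_input_shard(spec: str) -> tuple[int, int]:
    """Parse an --input-shard spec like "1/4" into (shard_idx, total_shards)."""
    shard_idx, total_shards = (int(part) for part in spec.split("/"))
    if not 1 <= shard_idx <= total_shards:
        raise ValueError(f"invalid --input-shard spec: {spec!r}")
    return shard_idx, total_shards

assert parse_input_shard("1/4") == (1, 4)  # first of four shards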
@@ -353,17 +356,13 @@ def helion_method(
        attr = getattr(mod, attr_name)
        if isinstance(attr, Kernel):
            attr.reset()
+           # Force autotuning unless HELION_USE_DEFAULT_CONFIG=1 is set
+           # This ensures we run autotuning even if the kernel has pre-specified configs
+           if os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") != "1":
+               attr.settings.force_autotune = True

    def _inner() -> Callable[..., Any] | object:
-       # Force autotuning unless HELION_USE_DEFAULT_CONFIG=1 is set
-       # This ensures we run autotuning even if the kernel has pre-specified configs
-       if os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") != "1":
-           # Find all Kernel objects in the module and force autotuning
-           for attr_name in dir(mod):
-               attr = getattr(mod, attr_name)
-               if isinstance(attr, Kernel):
-                   attr.settings.force_autotune = True
-
+       # BENCHMARK HOT PATH, do not add any new logic here
        result = kfunc(*args)
        if callable(result):
            return result()
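The motivation for this hunk: tritonbench times `_inner`, so the per-call module scan was inflating every measurement. Hoisted into one-time setup, the moved logic amounts to the sketch below; the helper name `force_autotune_once` is mine, and I assume `Kernel` is in scope as in the surrounding file:

import os

def force_autotune_once(mod) -> None:
    # Runs once at setup time, never in the timed path.
    if os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") == "1":
        return  # user opted into the kernel's pre-specified config
    for attr_name in dir(mod):
        attr = getattr(mod, attr_name)
        if isinstance(attr, Kernel):  # assumed in scope, as in run.py
            attr.settings.force_autotune = True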
@@ -401,15 +400,16 @@ def _inner() -> Callable[..., Any] | object:
        file=sys.stderr,
    )

-   # Create and run the operator with unknown args
-   op = Operator(tb_args=tb_args, extra_args=unknown_args)
+   from tritonbench.run import _run

    # Handle input sharding if requested
    if input_shard_info:
        shard_idx, total_shards = input_shard_info

        # Get the actual number of inputs for this operator
-       total_inputs = op._available_num_inputs
+       total_inputs = Operator(
+           tb_args=tb_args, extra_args=unknown_args
+       )._available_num_inputs

        # Calculate shard boundaries
        inputs_per_shard = total_inputs // total_shards
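A worked example of the boundary math, with made-up numbers, assuming (as the next hunk's context suggests) that the remainder of the floor division is folded into the last shard:

total_inputs, total_shards = 10, 4
inputs_per_shard = total_inputs // total_shards  # 2
for shard_idx in range(1, total_shards + 1):     # shards are 1-indexed, as in "1/4"
    start_idx = (shard_idx - 1) * inputs_per_shard
    shard_size = inputs_per_shard
    if shard_idx == total_shards:                # assumption: last shard absorbs the remainder
        shard_size += total_inputs % total_shards
    print(shard_idx, start_idx, shard_size)      # 1 0 2 / 2 2 2 / 3 4 2 / 4 6 4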
@@ -425,27 +425,21 @@ def _inner() -> Callable[..., Any] | object:
            )
        shard_size = inputs_per_shard

-       # Override the operator's input range
-       op._input_id = start_idx
-       op._num_inputs = shard_size
-
        print(
            f"Running input shard {shard_idx}/{total_shards}: inputs {start_idx} to {start_idx + shard_size - 1} (of {total_inputs} total)",
            file=sys.stderr,
        )

-   # Run with proper parameters
-   warmup = int(getattr(tb_args, "warmup", 25))
-   rep = int(getattr(tb_args, "iter", 100))
-   op.run(warmup=warmup, rep=rep)
+       # Add input-id and num-inputs to the tritonbench args before re-parsing
+       tritonbench_args.extend(
+           ["--input-id", str(start_idx), "--num-inputs", str(shard_size)]
+       )

-   # Print results
-   print("\nBenchmark Results:", file=sys.stderr)
-   print(op.output, file=sys.stderr)
+       # Re-parse args with the new input range
+       tb_args, unknown_args = tb_parser.parse_known_args(tritonbench_args)

-   # Clean up memory after running the kernel
-   # Delete the operator instance which contains all allocated tensors
-   del op
+   # Use tritonbench's _run function which handles arg processing
+   _run(tb_args, unknown_args)

    # Force garbage collection multiple times to ensure memory is freed
    for _ in range(3):
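The extend-then-reparse step above is standard argparse; a self-contained sketch of the same pattern, with a toy parser standing in for tritonbench's `tb_parser`:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input-id", type=int, default=0)
parser.add_argument("--num-inputs", type=int, default=None)

argv = ["--metrics", "speedup"]                        # flags this toy parser does not know
argv.extend(["--input-id", "4", "--num-inputs", "2"])  # same shape as the extend() above
args, unknown = parser.parse_known_args(argv)
print(args.input_id, args.num_inputs, unknown)         # 4 2 ['--metrics', 'speedup']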