Add equal-spaced input sampling mode (--input-sample-mode) to benchmarks/run.py

yf225 · yf225 · commit e01b2d023f80 · 2025-08-05T22:27:04.000-07:00
diff --git a/benchmarks/run.py b/benchmarks/run.py
@@ -14,6 +14,9 @@
 
 # On GPU-1, run first 1/4 of inputs for all kernels and save results to CSV in the current directory
 $ CUDA_VISIBLE_DEVICES=1 python benchmarks/run.py --input-shard 1/4 --metrics accuracy,tflops,gbps,speedup --csv --output-dir ./
+
+# Run 5 equally-spaced inputs instead of the first 5
+$ python benchmarks/run.py --kernel vector_add --num-inputs 5 --input-sample-mode equal-spaced
 """
 
 from __future__ import annotations
@@ -267,6 +270,7 @@ def run_kernel(
     kernel_name: str,
     tritonbench_args: list[str],
     input_shard_info: tuple[int, int] | None = None,
+    input_sample_mode: str = "first-n",
 ) -> None:
     """Run a kernel benchmark, handling both single and multiple variants."""
     # Check if kernel is in the mapping table
@@ -313,6 +317,7 @@ def run_kernel(
         tritonbench_args,
         input_shard_info,
         operator_args,
+        input_sample_mode,
     )
 
 
@@ -323,6 +328,7 @@ def run_kernel_variants(
     tritonbench_args: list[str],
     input_shard_info: tuple[int, int] | None = None,
     operator_args: dict[str, Any] | None = None,
+    input_sample_mode: str = "first-n",
 ) -> None:
     """Run kernel variants in the same benchmark run."""
 
@@ -461,18 +467,61 @@ def _inner() -> Callable[..., Any] | object:
 
     from tritonbench.run import _run
 
-    # Handle input sharding if requested
+    # Get the actual number of inputs for this operator
+    total_inputs = Operator(
+        tb_args=tb_args, extra_args=unknown_args
+    )._available_num_inputs
+
+    # First, handle input sampling based on --num-inputs and --input-sample-mode
+    selected_indices = None
+    
+    if "--num-inputs" in tritonbench_args:
+        # Make a copy to avoid modifying the original list
+        tritonbench_args = tritonbench_args.copy()
+        # Extract num-inputs value
+        num_inputs_idx = tritonbench_args.index("--num-inputs")
+        if num_inputs_idx + 1 < len(tritonbench_args):
+            num_inputs = int(tritonbench_args[num_inputs_idx + 1])
+            
+            if input_sample_mode == "equal-spaced":
+                # Calculate equal-spaced indices
+                if num_inputs >= total_inputs:
+                    # If requesting more inputs than available, just use all
+                    selected_indices = list(range(total_inputs))
+                else:
+                    # Calculate step size for equal spacing
+                    step = (total_inputs - 1) / (num_inputs - 1) if num_inputs > 1 else 0
+                    selected_indices = [int(round(i * step)) for i in range(num_inputs)]
+                
+                print(
+                    f"Step 1 - Equal-spaced sampling: {num_inputs} inputs from {total_inputs} total",
+                    file=sys.stderr,
+                )
+                print(f"  Selected indices: {selected_indices}", file=sys.stderr)
+            else:
+                # first-n mode: select first N inputs
+                selected_indices = list(range(min(num_inputs, total_inputs)))
+                print(
+                    f"Step 1 - First-n sampling: {num_inputs} inputs from {total_inputs} total",
+                    file=sys.stderr,
+                )
+                print(f"  Selected indices: {selected_indices}", file=sys.stderr)
+            
+            # Remove --num-inputs from args since we'll handle it differently
+            tritonbench_args.pop(num_inputs_idx)  # Remove --num-inputs
+            tritonbench_args.pop(num_inputs_idx)  # Remove the value
+    else:
+        # No sampling requested, use all inputs
+        selected_indices = list(range(total_inputs))
+
+    # Then, handle sharding if requested
     if input_shard_info:
         shard_idx, total_shards = input_shard_info
-
-        # Get the actual number of inputs for this operator
-        total_inputs = Operator(
-            tb_args=tb_args, extra_args=unknown_args
-        )._available_num_inputs
-
-        # Calculate shard boundaries
-        inputs_per_shard = total_inputs // total_shards
-        extra_inputs = total_inputs % total_shards
+        
+        # Calculate shard boundaries on the selected indices
+        num_selected = len(selected_indices)
+        inputs_per_shard = num_selected // total_shards
+        extra_inputs = num_selected % total_shards
 
         if shard_idx <= extra_inputs:
             start_idx = (shard_idx - 1) * (inputs_per_shard + 1)
@@ -484,15 +533,21 @@ def _inner() -> Callable[..., Any] | object:
             )
             shard_size = inputs_per_shard
 
+        # Get the actual indices for this shard
+        shard_indices = selected_indices[start_idx:start_idx + shard_size]
+        
         print(
-            f"Running input shard {shard_idx}/{total_shards}: inputs {start_idx} to {start_idx + shard_size - 1} (of {total_inputs} total)",
+            f"Step 2 - Sharding: shard {shard_idx}/{total_shards} gets {len(shard_indices)} inputs",
             file=sys.stderr,
         )
+        print(f"  Shard indices: {shard_indices}", file=sys.stderr)
+        
+        # Update selected_indices to only include this shard
+        selected_indices = shard_indices
 
-        # Add input-id and num-inputs to the tritonbench args before re-parsing
-        tritonbench_args.extend(
-            ["--input-id", str(start_idx), "--num-inputs", str(shard_size)]
-        )
+    # Add the final selected indices to tritonbench args
+    if selected_indices is not None and len(selected_indices) > 0 and len(selected_indices) < total_inputs:
+        tritonbench_args.extend(["--input-id", ",".join(map(str, selected_indices))])
 
     # Re-parse args with the new input range
     tb_args, unknown_args = tb_parser.parse_known_args(tritonbench_args)
@@ -523,6 +578,13 @@ def main() -> None:
         type=str,
         help="Run only a subset of inputs for each kernel. Format: M/N where M is the shard number (1-indexed) and N is the total number of shards. For example, --input-shard 1/3 runs the first third of inputs for each kernel.",
     )
+    parser.add_argument(
+        "--input-sample-mode",
+        type=str,
+        choices=["first-n", "equal-spaced"],
+        default="first-n",
+        help="How to sample inputs when using --num-inputs. 'first-n' (default) takes the first X inputs. 'equal-spaced' takes X inputs equally spaced throughout the input list.",
+    )
 
     # Parse known args to get the kernel name, pass rest to tritonbench
     args, tritonbench_args = parser.parse_known_args()
@@ -568,7 +630,7 @@ def main() -> None:
 
         # Run specified kernels
         if len(kernel_names) == 1:
-            run_kernel(kernel_names[0], tritonbench_args, input_shard_info)
+            run_kernel(kernel_names[0], tritonbench_args, input_shard_info, args.input_sample_mode)
         else:
             print(
                 f"Running {len(kernel_names)} kernels: {', '.join(kernel_names)}...\n",
@@ -578,15 +640,15 @@ def main() -> None:
                 print(f"\n{'=' * 60}", file=sys.stderr)
                 print(f"Kernel: {kernel_name}", file=sys.stderr)
                 print(f"{'=' * 60}\n", file=sys.stderr)
-                run_kernel(kernel_name, tritonbench_args.copy(), input_shard_info)
+                run_kernel(kernel_name, tritonbench_args.copy(), input_shard_info, args.input_sample_mode)
     else:
         # Run all kernels
         print(f"Running all {len(KERNEL_MAPPINGS)} kernels...\n", file=sys.stderr)
         for kernel_name in KERNEL_MAPPINGS:
             print(f"\n{'=' * 60}", file=sys.stderr)
             print(f"Kernel: {kernel_name}", file=sys.stderr)
             print(f"{'=' * 60}\n", file=sys.stderr)
-            run_kernel(kernel_name, tritonbench_args.copy(), input_shard_info)
+            run_kernel(kernel_name, tritonbench_args.copy(), input_shard_info, args.input_sample_mode)
 
 
 if __name__ == "__main__":