8 changes: 8 additions & 0 deletions sharktuner/dispatch_tuner/dispatch_tuner.py
@@ -159,3 +159,11 @@ def main() -> None:
print(path_config.run_log.resolve())
print("Check the summary in:")
print(summary_log_file.resolve())

output_csv_name = f"{args.dispatch_file.stem}_candidate_analysis.csv"
csv_path = Path(path_config.base_dir) / output_csv_name

libtuner.candidate_ordering.export_record_to_csv(
dispatch_tuner.tuning_records, csv_path
)
print(f"Wrote tuning records CSV: {csv_path}")
85 changes: 83 additions & 2 deletions sharktuner/sharktuner/candidate_ordering.py
@@ -1,7 +1,11 @@
from enum import Enum
from typing import Optional, Callable
import random
import logging
import csv
from typing import Optional, Any
from dataclasses import dataclass
from pathlib import Path
from enum import Enum
from typing import Optional, Callable

from iree.compiler.dialects import iree_gpu # type: ignore

@@ -104,3 +108,80 @@ def reorder_assignments(
return indices
case _:
assert False


@dataclass
class TuningRecord:
"""
Records a candidate's knob configuration and tuning results.

Used to analyze the candidate search space and to evaluate the
effectiveness of candidate ordering heuristics.
"""

gen_id: int # Original index from candidate generation.
candidate_id: int # Index in candidate_trackers after reordering.
knob: Optional[common.KnobAssignment] = None
to_compile: bool = False
compile_status: bool = False
to_benchmark: bool = False
benchmark_device_id: Optional[str] = None
benchmark_queue_position: Optional[int] = None
benchmark_status: bool = False
baseline_benchmark_time_us: Optional[float] = None
benchmark_time_us: Optional[float] = None
benchmark_speedup: Optional[float] = None
benchmark_rank_order: Optional[int] = None


def build_tuning_records_from_order(
knobs: list[Optional[common.KnobAssignment]], sorted_order: list[int]
) -> list[TuningRecord]:
tuning_records: list[TuningRecord] = []
# candidate_id = 0 is the baseline and is not included in tuning_records.
for sorted_position, original_gen_index in enumerate(sorted_order, start=1):
tr = TuningRecord(
gen_id=original_gen_index,
candidate_id=sorted_position,
knob=knobs[original_gen_index],
)
tuning_records.append(tr)

return tuning_records


def flatten_records(
tuning_records: list[TuningRecord],
) -> list[dict[str, Any]]:
"""
    Flatten a list of `TuningRecord` objects into CSV rows.

    - Each record becomes one CSV row (a dict keyed by column name).
    - Top-level attributes (e.g., `gen_id`, `benchmark_time_us`) appear as individual columns.
    - Nested `KnobAssignment` objects (the `knob` field) are flattened into columns like `knob_M`, `knob_tile_m`.
"""
rows = []
for tuning_record in tuning_records:
row = {}
for attr, val in vars(tuning_record).items():
if isinstance(val, common.KnobAssignment):
knob_dict = val.get_knobs()
for k, v in knob_dict.items():
row[f"{attr}_{k}"] = v
else:
row[attr] = val
rows.append(row)

return rows


def export_record_to_csv(tuning_records: list[TuningRecord], dest_file: Path) -> None:
assert tuning_records

rows = flatten_records(tuning_records)
headers = list(rows[0].keys())

with open(dest_file, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=headers)
writer.writeheader()
writer.writerows(rows)
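As a quick end-to-end illustration of the new helpers (a sketch, not part of the diff; it assumes the module is importable as `sharktuner.candidate_ordering` and leaves the knob assignments as `None`, since constructing a real `common.KnobAssignment` is out of scope here):

from pathlib import Path
from sharktuner import candidate_ordering  # assumed import path

# knobs is indexed by original generation index; index 0 is the baseline.
knobs = [None, None, None, None]
# Original generation indices in the order produced by the reordering heuristic.
sorted_order = [3, 1, 2]

records = candidate_ordering.build_tuning_records_from_order(knobs, sorted_order)
# records[0] == TuningRecord(gen_id=3, candidate_id=1, knob=None, ...)

candidate_ordering.export_record_to_csv(records, Path("candidate_analysis.csv"))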
31 changes: 31 additions & 0 deletions sharktuner/sharktuner/libtuner.py
@@ -125,6 +125,7 @@ def __init__(self, tuner_context: common.TunerContext):
self.tuner_context = tuner_context
self.candidate_trackers: list[CandidateTracker] = []
self.target_info: Optional[iree_gpu.TargetInfo] = None
self.tuning_records: list[candidate_ordering.TuningRecord] = []

@abstractmethod
def get_iree_compile_flags(self) -> list[str]:
@@ -845,6 +846,10 @@ def generate_candidate_specs(
# Total number of configs = candidates generated + baseline.
assert len(config_specs) == len(solutions) + 1

tuning_client.tuning_records = (
candidate_ordering.build_tuning_records_from_order(knobs, sorted_order)
)

knob_assignments = [dispatch_tuner.get_knob_assignment(s) for s in solutions]
logging.debug("candidate_gen.py ends")
handle_error(
@@ -1193,6 +1198,7 @@ def compile(
# Set the source and output file paths for compilation of each candidate.
path_config.compiled_dir.mkdir(parents=True, exist_ok=True)
for i in candidates:
tuning_client.tuning_records[i].to_compile = True
vmfb_file_name = path_config.get_candidate_vmfb_filename(
tuning_client.candidate_trackers[i].candidate_id
)
@@ -1231,6 +1237,7 @@ def compile(
# Remove duplicate vmfbs from the candidate list.
compiled_candidate_hashes = []
for candidate_id in compiled_candidates:
tuning_client.tuning_records[candidate_id].compile_status = True
candidate_vmfb = tuning_client.candidate_trackers[
candidate_id
].compiled_vmfb_path
@@ -1283,6 +1290,9 @@ def benchmark(
f"Smart candidate benchmark timeout is set to {subprocess_timeout_reference:.2f}s"
)
candidate_indices = [i for i in compiled_candidates if i != 0]
for i, idx in enumerate(candidate_indices, start=1):
tuning_client.tuning_records[idx].benchmark_queue_position = i
tuning_client.tuning_records[idx].to_benchmark = True

candidate_results = benchmark_candidates(
candidate_indices=candidate_indices,
@@ -1292,6 +1302,15 @@
benchmark_time=benchmark_time, # Only candidate benchmark has time limit.
)

for res in candidate_results:
c_id = res.candidate_id
res_time = res.time
tuning_client.tuning_records[c_id].benchmark_device_id = res.device_id
if res_time == math.inf:
continue
tuning_client.tuning_records[c_id].benchmark_status = True
tuning_client.tuning_records[c_id].benchmark_time_us = round(res_time, 2)

second_baseline_result, _ = benchmark_baseline(
devices=args.devices,
tuning_client=tuning_client,
@@ -1315,6 +1334,18 @@
candidate_results,
prune_slow_candidates=tuning_client.should_prune_slower_candidates(),
)

# Best candidate gets rank 1.
for i, handler_res in enumerate(all_candidates_with_speedup, start=1):
benchmark_res, speedup = handler_res
cid, _, device_id = benchmark_res
baseline_res = baseline_handler.get_average_result_us(device_id)
tuning_client.tuning_records[cid].baseline_benchmark_time_us = (
round(baseline_res, 2) if baseline_res else None
)
tuning_client.tuning_records[cid].benchmark_speedup = round(speedup, 5)
tuning_client.tuning_records[cid].benchmark_rank_order = i

top_candidates_with_speedup = (
all_candidates_with_speedup[:num_candidates]
if num_candidates
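A rough post-run analysis sketch, outside the diff: once the CSV is written, the records can be read back with the standard csv module. Column names follow the TuningRecord fields above (knob fields, when present, are flattened into knob_<name> columns); the file name below is hypothetical.

import csv

with open("conv_dispatch_0_candidate_analysis.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

# Keep candidates that completed benchmarking (rank cells are empty otherwise)
# and list the top entries; rank 1 is the best-performing candidate.
ranked = [r for r in rows if r["benchmark_rank_order"]]
ranked.sort(key=lambda r: int(r["benchmark_rank_order"]))
for r in ranked[:5]:
    print(r["candidate_id"], r["benchmark_time_us"], r["benchmark_speedup"])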