diff --git a/sharktuner/dispatch_tuner/dispatch_tuner.py b/sharktuner/dispatch_tuner/dispatch_tuner.py
index 03a29a97bb2..ac159da392e 100644
--- a/sharktuner/dispatch_tuner/dispatch_tuner.py
+++ b/sharktuner/dispatch_tuner/dispatch_tuner.py
@@ -159,3 +159,11 @@ def main() -> None:
     print(path_config.run_log.resolve())
     print("Check the summary in:")
     print(summary_log_file.resolve())
+
+    output_csv_name = f"{args.dispatch_file.stem}_candidate_analysis.csv"
+    csv_path = Path(path_config.base_dir) / output_csv_name
+
+    libtuner.candidate_ordering.export_record_to_csv(
+        dispatch_tuner.tuning_records, csv_path
+    )
+    print(f"Wrote tuning records CSV: {csv_path}")
diff --git a/sharktuner/sharktuner/candidate_ordering.py b/sharktuner/sharktuner/candidate_ordering.py
index f61db9f021c..7a20020df45 100644
--- a/sharktuner/sharktuner/candidate_ordering.py
+++ b/sharktuner/sharktuner/candidate_ordering.py
@@ -1,7 +1,10 @@
-from enum import Enum
-from typing import Optional, Callable
 import random
 import logging
+import csv
+from typing import Any, Callable, Optional
+from dataclasses import dataclass
+from pathlib import Path
+from enum import Enum

 from iree.compiler.dialects import iree_gpu  # type: ignore

@@ -104,3 +107,80 @@ def reorder_assignments(
             return indices
         case _:
             assert False
+
+
+@dataclass
+class TuningRecord:
+    """
+    Records a candidate's knob configuration and tuning results.
+
+    Used to analyze the candidate search space and to evaluate the
+    effectiveness of candidate ordering heuristics.
+    """
+
+    gen_id: int  # Original index from candidate generation.
+    candidate_id: int  # Index in candidate_trackers after reordering.
+    knob: Optional[common.KnobAssignment] = None
+    to_compile: bool = False
+    compile_status: bool = False
+    to_benchmark: bool = False
+    benchmark_device_id: Optional[str] = None
+    benchmark_queue_position: Optional[int] = None
+    benchmark_status: bool = False
+    baseline_benchmark_time_us: Optional[float] = None
+    benchmark_time_us: Optional[float] = None
+    benchmark_speedup: Optional[float] = None
+    benchmark_rank_order: Optional[int] = None
+
+
+def build_tuning_records_from_order(
+    knobs: list[Optional[common.KnobAssignment]], sorted_order: list[int]
+) -> list[TuningRecord]:
+    tuning_records: list[TuningRecord] = []
+    # candidate_id = 0 is the baseline and is not included in tuning_records.
+    for sorted_position, original_gen_index in enumerate(sorted_order, start=1):
+        tr = TuningRecord(
+            gen_id=original_gen_index,
+            candidate_id=sorted_position,
+            knob=knobs[original_gen_index],
+        )
+        tuning_records.append(tr)
+
+    return tuning_records
+
+
+def flatten_records(
+    tuning_records: list[TuningRecord],
+) -> list[dict[str, Any]]:
+    """
+    Flatten a list of `TuningRecord` objects into CSV rows.
+
+    - Each record becomes one CSV row.
+    - Top-level attributes (e.g., `gen_id`, `benchmark_time_us`) appear as individual columns.
+    - Nested objects (e.g., `knob`) are flattened into columns like `knob_M`, `knob_tile_m`.
+    """
+    rows = []
+    for tuning_record in tuning_records:
+        row = {}
+        for attr, val in vars(tuning_record).items():
+            if isinstance(val, common.KnobAssignment):
+                knob_dict = val.get_knobs()
+                for k, v in knob_dict.items():
+                    row[f"{attr}_{k}"] = v
+            else:
+                row[attr] = val
+        rows.append(row)
+
+    return rows
+
+
+def export_record_to_csv(tuning_records: list[TuningRecord], dest_file: Path) -> None:
+    assert tuning_records
+
+    rows = flatten_records(tuning_records)
+    headers = list(rows[0].keys())
+
+    with open(dest_file, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=headers)
+        writer.writeheader()
+        writer.writerows(rows)
diff --git a/sharktuner/sharktuner/libtuner.py b/sharktuner/sharktuner/libtuner.py
index 111b24809ec..6b1a3ddd6a6 100644
--- a/sharktuner/sharktuner/libtuner.py
+++ b/sharktuner/sharktuner/libtuner.py
@@ -125,6 +125,7 @@ def __init__(self, tuner_context: common.TunerContext):
         self.tuner_context = tuner_context
         self.candidate_trackers: list[CandidateTracker] = []
         self.target_info: Optional[iree_gpu.TargetInfo] = None
+        self.tuning_records: list[candidate_ordering.TuningRecord] = []

     @abstractmethod
     def get_iree_compile_flags(self) -> list[str]:
@@ -845,6 +846,10 @@ def generate_candidate_specs(
     # Total number of configs = candidates generated + baseline.
     assert len(config_specs) == len(solutions) + 1

+    tuning_client.tuning_records = (
+        candidate_ordering.build_tuning_records_from_order(knobs, sorted_order)
+    )
+
     knob_assignments = [dispatch_tuner.get_knob_assignment(s) for s in solutions]
     logging.debug("candidate_gen.py ends")
     handle_error(
@@ -1193,6 +1198,7 @@ def compile(
     # Set the source and output file paths for compilation of each candidate.
     path_config.compiled_dir.mkdir(parents=True, exist_ok=True)
     for i in candidates:
+        tuning_client.tuning_records[i].to_compile = True
         vmfb_file_name = path_config.get_candidate_vmfb_filename(
             tuning_client.candidate_trackers[i].candidate_id
         )
@@ -1231,6 +1237,7 @@ def compile(
     # Remove duplicate vmfbs from the candidate list.
     compiled_candidate_hashes = []
     for candidate_id in compiled_candidates:
+        tuning_client.tuning_records[candidate_id].compile_status = True
         candidate_vmfb = tuning_client.candidate_trackers[
             candidate_id
         ].compiled_vmfb_path
@@ -1283,6 +1290,9 @@ def benchmark(
         f"Smart candidate benchmark timeout is set to {subprocess_timeout_reference:.2f}s"
     )
     candidate_indices = [i for i in compiled_candidates if i != 0]
+    for i, idx in enumerate(candidate_indices, start=1):
+        tuning_client.tuning_records[idx].benchmark_queue_position = i
+        tuning_client.tuning_records[idx].to_benchmark = True

     candidate_results = benchmark_candidates(
         candidate_indices=candidate_indices,
@@ -1292,6 +1302,15 @@ def benchmark(
         benchmark_time=benchmark_time,  # Only candidate benchmark has time limit.
     )

+    for res in candidate_results:
+        c_id = res.candidate_id
+        res_time = res.time
+        tuning_client.tuning_records[c_id].benchmark_device_id = res.device_id
+        if res_time == math.inf:
+            continue
+        tuning_client.tuning_records[c_id].benchmark_status = True
+        tuning_client.tuning_records[c_id].benchmark_time_us = round(res_time, 2)
+
     second_baseline_result, _ = benchmark_baseline(
         devices=args.devices,
         tuning_client=tuning_client,
@@ -1315,6 +1334,18 @@ def benchmark(
         candidate_results,
         prune_slow_candidates=tuning_client.should_prune_slower_candidates(),
     )
+
+    # Best candidate gets rank 1.
+    for i, handler_res in enumerate(all_candidates_with_speedup, start=1):
+        benchmark_res, speedup = handler_res
+        cid, _, device_id = benchmark_res
+        baseline_res = baseline_handler.get_average_result_us(device_id)
+        tuning_client.tuning_records[cid].baseline_benchmark_time_us = (
+            round(baseline_res, 2) if baseline_res else None
+        )
+        tuning_client.tuning_records[cid].benchmark_speedup = round(speedup, 5)
+        tuning_client.tuning_records[cid].benchmark_rank_order = i
+
     top_candidates_with_speedup = (
         all_candidates_with_speedup[:num_candidates]
         if num_candidates
diff --git a/sharktuner/tests/candidate_ordering_test.py b/sharktuner/tests/candidate_ordering_test.py
index ab0bafd9ec1..fc9654b315f 100644
--- a/sharktuner/tests/candidate_ordering_test.py
+++ b/sharktuner/tests/candidate_ordering_test.py
@@ -6,6 +6,7 @@

 import math
 import pytest
+from typing import Optional

 from iree.compiler import ir  # type: ignore
 from iree.compiler.dialects import iree_gpu  # type: ignore
@@ -13,60 +14,63 @@
 from sharktuner import candidate_ordering, common


-knob_1 = common.LLVMGPUVectorDistributeContractionKnobs(
-    M=2048,
-    N=10240,
-    K=1280,
-    tile_m=128,
-    tile_n=64,
-    tile_k=64,
-    wg_x=64,
-    wg_y=2,
-    wg_z=1,
-    subgroup_m_cnt=2,
-    subgroup_n_cnt=1,
-    intrinsic_mn=32,
-    intrinsic_k=8,
-    subgroup_m=0,
-    subgroup_n=0,
-    subgroup_k=0,
-)
-knob_2 = common.LLVMGPUVectorDistributeContractionKnobs(
-    M=2048,
-    N=10240,
-    K=1280,
-    tile_m=64,
-    tile_n=320,
-    tile_k=80,
-    wg_x=320,
-    wg_y=1,
-    wg_z=1,
-    subgroup_m_cnt=1,
-    subgroup_n_cnt=5,
-    intrinsic_mn=16,
-    intrinsic_k=16,
-    subgroup_m=0,
-    subgroup_n=0,
-    subgroup_k=0,
-)
-knob_3 = common.LLVMGPUVectorDistributeContractionKnobs(
-    M=2048,
-    N=10240,
-    K=1280,
-    tile_m=64,
-    tile_n=256,
-    tile_k=16,
-    wg_x=256,
-    wg_y=2,
-    wg_z=1,
-    subgroup_m_cnt=2,
-    subgroup_n_cnt=4,
-    intrinsic_mn=16,
-    intrinsic_k=16,
-    subgroup_m=0,
-    subgroup_n=0,
-    subgroup_k=0,
-)
+@pytest.fixture
+def sample_knobs() -> list[Optional[common.KnobAssignment]]:
+    knob_1 = common.LLVMGPUVectorDistributeContractionKnobs(
+        M=2048,
+        N=10240,
+        K=1280,
+        tile_m=128,
+        tile_n=64,
+        tile_k=64,
+        wg_x=64,
+        wg_y=2,
+        wg_z=1,
+        subgroup_m_cnt=2,
+        subgroup_n_cnt=1,
+        intrinsic_mn=32,
+        intrinsic_k=8,
+        subgroup_m=0,
+        subgroup_n=0,
+        subgroup_k=0,
+    )
+    knob_2 = common.LLVMGPUVectorDistributeContractionKnobs(
+        M=2048,
+        N=10240,
+        K=1280,
+        tile_m=64,
+        tile_n=320,
+        tile_k=80,
+        wg_x=320,
+        wg_y=1,
+        wg_z=1,
+        subgroup_m_cnt=1,
+        subgroup_n_cnt=5,
+        intrinsic_mn=16,
+        intrinsic_k=16,
+        subgroup_m=0,
+        subgroup_n=0,
+        subgroup_k=0,
+    )
+    knob_3 = common.LLVMGPUVectorDistributeContractionKnobs(
+        M=2048,
+        N=10240,
+        K=1280,
+        tile_m=64,
+        tile_n=256,
+        tile_k=16,
+        wg_x=256,
+        wg_y=2,
+        wg_z=1,
+        subgroup_m_cnt=2,
+        subgroup_n_cnt=4,
+        intrinsic_mn=16,
+        intrinsic_k=16,
+        subgroup_m=0,
+        subgroup_n=0,
+        subgroup_k=0,
+    )
+    return [knob_1, knob_2, knob_3]


 @pytest.fixture
@@ -100,14 +104,15 @@ def test_math_expression() -> None:
     assert math.isclose(ai, expected, rel_tol=1e-9)


-def test_reorder_assignments(target_info: iree_gpu.TargetInfo) -> None:
-    knobs: list[common.KnobAssignment | None] = [knob_1, knob_2, knob_3]
-
+def test_reorder_assignments(
+    target_info: iree_gpu.TargetInfo,
+    sample_knobs: list[Optional[common.KnobAssignment]],
+) -> None:
     expected_order = [0, 1, 2]
     assert (
         candidate_ordering.reorder_assignments(
             target_info=target_info,
-            knobs=knobs,
+            knobs=sample_knobs,
             strategy=candidate_ordering.CandidateOrderKind.no_sort,
         )
         == expected_order
@@ -117,7 +122,7 @@ def test_reorder_assignments(target_info: iree_gpu.TargetInfo) -> None:
     assert (
         candidate_ordering.reorder_assignments(
             target_info=target_info,
-            knobs=knobs,
+            knobs=sample_knobs,
             strategy=candidate_ordering.CandidateOrderKind.heuristic,
         )
         == expected_order
@@ -126,14 +131,14 @@ def test_reorder_assignments(target_info: iree_gpu.TargetInfo) -> None:
     expected_order = [0, 2, 1]
     assert (
         candidate_ordering.reorder_assignments(
-            knobs=knobs,
+            knobs=sample_knobs,
             strategy=candidate_ordering.CandidateOrderKind.heuristic,
             key_fn=lambda knob: knob.tile_n,
         )
         == expected_order
     )

-    knobs = [None, None, None]
+    knobs: list[Optional[common.KnobAssignment]] = [None, None, None]
     assert (
         candidate_ordering.reorder_assignments(
             target_info=target_info,
@@ -152,3 +157,108 @@ def test_reorder_assignments(target_info: iree_gpu.TargetInfo) -> None:
         )
         == []
     )
+
+
+def test_build_tuning_records_from_order(
+    sample_knobs: list[Optional[common.KnobAssignment]],
+) -> None:
+    tr1 = candidate_ordering.TuningRecord(
+        gen_id=2,
+        candidate_id=1,
+        knob=sample_knobs[2],
+    )
+    tr2 = candidate_ordering.TuningRecord(
+        gen_id=0,
+        candidate_id=2,
+        knob=sample_knobs[0],
+    )
+    tr3 = candidate_ordering.TuningRecord(
+        gen_id=1,
+        candidate_id=3,
+        knob=sample_knobs[1],
+    )
+    sorted_order = [2, 0, 1]
+    tuning_records = candidate_ordering.build_tuning_records_from_order(
+        sample_knobs, sorted_order
+    )
+
+    assert tuning_records == [tr1, tr2, tr3]
+
+
+def test_flatten_records(
+    sample_knobs: list[Optional[common.KnobAssignment]],
+) -> None:
+    tr1 = candidate_ordering.TuningRecord(
+        gen_id=2,
+        candidate_id=1,
+        knob=sample_knobs[2],
+        to_compile=True,
+        benchmark_device_id="hip://2",
+        benchmark_queue_position=1,
+        baseline_benchmark_time_us=123.4,
+        benchmark_speedup=1.5,
+    )
+    tr2 = candidate_ordering.TuningRecord(
+        gen_id=1,
+        candidate_id=2,
+        knob=sample_knobs[1],
+        to_benchmark=True,
+        benchmark_time_us=153.56,
+    )
+    sample_tuning_records = [tr1, tr2]
+
+    rows = candidate_ordering.flatten_records(sample_tuning_records)
+
+    expected_key_rows: list[dict] = [
+        {
+            "baseline_benchmark_time_us": 123.4,
+            "benchmark_device_id": "hip://2",
+            "benchmark_queue_position": 1,
+            "benchmark_speedup": 1.5,
+            "candidate_id": 1,
+            "gen_id": 2,
+            "knob_K": 1280,
+            "knob_M": 2048,
+            "knob_N": 10240,
+            "knob_intrinsic_k": 16,
+            "knob_intrinsic_mn": 16,
+            "knob_subgroup_k": 0,
+            "knob_subgroup_m": 0,
+            "knob_subgroup_m_cnt": 2,
+            "knob_subgroup_n": 0,
+            "knob_subgroup_n_cnt": 4,
+            "knob_tile_k": 16,
+            "knob_tile_m": 64,
+            "knob_tile_n": 256,
+            "knob_wg_x": 256,
+            "knob_wg_y": 2,
+            "knob_wg_z": 1,
+            "to_compile": True,
+        },
+        {
+            "benchmark_time_us": 153.56,
+            "candidate_id": 2,
+            "gen_id": 1,
+            "knob_K": 1280,
+            "knob_M": 2048,
+            "knob_N": 10240,
+            "knob_intrinsic_k": 16,
+            "knob_intrinsic_mn": 16,
+            "knob_subgroup_k": 0,
+            "knob_subgroup_m": 0,
+            "knob_subgroup_m_cnt": 1,
+            "knob_subgroup_n": 0,
+            "knob_subgroup_n_cnt": 5,
+            "knob_tile_k": 80,
+            "knob_tile_m": 64,
+            "knob_tile_n": 320,
+            "knob_wg_x": 320,
+            "knob_wg_y": 1,
+            "knob_wg_z": 1,
+            "to_benchmark": True,
+        },
+    ]

+    for expected_key_row, actual_row in zip(expected_key_rows, rows):
+        for attr, val in expected_key_row.items():
+            assert actual_row[attr] == val
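
For reference, and not part of the patch: a minimal sketch of how the pieces added above fit together. The knob values are copied from the sample_knobs test fixture, the hand-set record fields stand in for what libtuner fills during a real run, and the output file name is hypothetical.

from pathlib import Path

from sharktuner import candidate_ordering, common

# One knob assignment, copied verbatim from the sample_knobs fixture.
knob = common.LLVMGPUVectorDistributeContractionKnobs(
    M=2048, N=10240, K=1280,
    tile_m=128, tile_n=64, tile_k=64,
    wg_x=64, wg_y=2, wg_z=1,
    subgroup_m_cnt=2, subgroup_n_cnt=1,
    intrinsic_mn=32, intrinsic_k=8,
    subgroup_m=0, subgroup_n=0, subgroup_k=0,
)

# sorted_order holds generation indices in benchmark-queue order; with a
# single candidate the order is trivial. candidate_id starts at 1 because
# candidate 0 is the baseline.
records = candidate_ordering.build_tuning_records_from_order(
    knobs=[knob], sorted_order=[0]
)

# In a real run, compile() and benchmark() in libtuner populate these
# fields; set them by hand here just to produce a filled-in CSV row.
records[0].to_compile = True
records[0].compile_status = True
records[0].benchmark_time_us = 153.56

# Writes one row with flat columns: gen_id, candidate_id, ...,
# plus knob_M, knob_tile_m, and the other flattened knob fields.
candidate_ordering.export_record_to_csv(records, Path("candidate_analysis.csv"))

dispatch_tuner.py wires this same call into main(), writing {dispatch_file.stem}_candidate_analysis.csv under path_config.base_dir at the end of a tuning run.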