Skip to content

Commit c65540d

Browse files
committed
Initial implementation of comparison code
1 parent ef88ea0 commit c65540d

File tree

4 files changed

+317
-0
lines changed

4 files changed

+317
-0
lines changed
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import os
2+
import sys
3+
import json
4+
from pathlib import Path
5+
from dataclasses import dataclass, asdict
6+
7+
from utils.aggregate import SimpleMedian
8+
from utils.validate import Validate
9+
from utils.result import Result, BenchmarkRun
10+
from options import options
11+
12+
@dataclass
class BenchmarkHistoricAverage:
    """Historic "average" (measure of central tendency) of a benchmark test,
    computed by aggregating the values recorded in prior result files."""

    # Name of benchmark as defined in Benchmark class definition
    name: str

    # Measure of central tendency used to compute "average" (e.g. "median")
    average_type: str
    # TODO replace this with Compare enum?
    # However, compare enum's use in the history is ambiguous, perhaps a new enum
    # should replace both

    # Value recorded from the benchmark
    value: float
    # TODO "value" in compute_benchmark assumes median, what about tracking e.g.
    # standard deviation through this process?

    # Arguments used to call the benchmark executable.
    #
    # This exists to ensure benchmarks called using different arguments are not
    # compared together.
    command_args: set[str]
    # TODO Ensure ONEAPI_DEVICE_SELECTOR? GPU name itself?
34+
35+
class Compare:
    """Comparison utilities: build a historic average from stored benchmark
    result files and compare a new result file against it."""

    @staticmethod
    def get_hist_avg(
        result_name: str,
        result_dir: str,
        cutoff: str,
        aggregator=SimpleMedian,
        exclude: "list[str] | None" = None,
    ) -> dict[str, BenchmarkHistoricAverage]:
        """Compute a historic average for every test found in result files.

        Parameters:
            result_name (str): Save name of the result to aggregate
            result_dir (str): Directory to look for result files in
            cutoff (str): Oldest timestamp (YYYYMMDD_HHMMSS) to include
            aggregator: Aggregator class used to compute the "average"
            exclude: Result file stems to skip (None means exclude nothing)

        Returns:
            dict mapping test name -> BenchmarkHistoricAverage
        """
        # None default instead of a mutable [] default (shared across calls).
        excluded = set(exclude) if exclude else set()

        def get_timestamp(f) -> str:
            """Extract timestamp from result filename."""
            return str(f)[-len("YYYYMMDD_HHMMSS.json") : -len(".json")]

        def get_result_paths() -> list:
            """
            Get a list of all results matching result_name in result_dir that
            are newer than the timestamp specified by cutoff.
            """
            cache_dir = Path(f"{result_dir}")

            # Assumes filename format is <name>_YYYYMMDD_HHMMSS.json; sorting
            # by filename therefore sorts results from oldest to newest.
            return sorted(
                f
                for f in cache_dir.glob(f"{result_name}_*_*.json")
                if f.is_file()
                and Validate.timestamp(get_timestamp(f))
                and get_timestamp(f) > cutoff
                # Result file is not excluded
                and f.stem not in excluded
            )

        def new_aggregate(test_run) -> dict:
            """Fresh aggregation record seeded with one test run.

            The assumption here is that test_run.value is a median.
            TODO standardization should happen here on what "value" really is.
            """
            return {
                "command_args": set(test_run.command[1:]),
                "aggregate": aggregator(starting_elements=[test_run.value]),
            }

        # key: name of the benchmark test result
        # value: { command_args: set[str], aggregate: Aggregator }
        #
        # This is then used to build a dict[BenchmarkHistoricAverage] used
        # to find historic averages.
        average_aggregate: dict[str, dict] = {}

        for result_path in get_result_paths():
            with result_path.open("r") as result_f:
                result = BenchmarkRun.from_json(json.load(result_f))

            if result.name != result_name:
                print(f"Warning: Result file {result_path} has mismatching name {result.name}. Skipping file.")
                continue

            for test_run in result.results:
                entry = average_aggregate.get(test_run.name)
                if entry is None:
                    # First sighting of this benchmark test.
                    average_aggregate[test_run.name] = new_aggregate(test_run)
                elif set(test_run.command[1:]) == entry["command_args"]:
                    # Same command args as prior runs: aggregate the value.
                    entry["aggregate"].add(test_run.value)
                else:
                    # If the command args used between runs are different,
                    # discard old run data and prefer new command args.
                    #
                    # This relies on the fact that paths from
                    # get_result_paths() are sorted from older to newer.
                    print(f"Warning: Command args for {test_run.name} from {result_path} is different from prior runs.")
                    print("DISCARDING older data and OVERRIDING with data using new arg.")
                    average_aggregate[test_run.name] = new_aggregate(test_run)

        return {
            name: BenchmarkHistoricAverage(
                name=name,
                average_type=stats["aggregate"].get_type(),
                value=stats["aggregate"].get_avg(),
                command_args=stats["command_args"],
            )
            for name, stats in average_aggregate.items()
        }

    @staticmethod
    def to_hist_avg(
        hist_avg: dict[str, BenchmarkHistoricAverage], compare_file: str
    ) -> tuple:
        """Compare results in compare_file against historic averages.

        Parameters:
            hist_avg: Historic averages from get_hist_avg()
            compare_file (str): Path of the result file to compare

        Returns:
            (improvement, regression) lists of per-test diff dicts whose
            delta exceeds options.regression_threshold.
        """
        with open(compare_file, "r") as compare_f:
            compare_result = BenchmarkRun.from_json(json.load(compare_f))

        improvement = []
        regression = []

        for test in compare_result.results:
            if test.name not in hist_avg:
                continue
            hist = hist_avg[test.name]
            # Only compare runs invoked with identical command args:
            if hist.command_args != set(test.command[1:]):
                print(f"Warning: skipped {test.name} due to command args mismatch.")
                continue
            # Guard against ZeroDivisionError from degenerate results.
            if hist.value == 0 or test.value == 0:
                print(f"Warning: skipped {test.name} due to zero-valued result.")
                continue

            # Positive delta == improvement relative to the historic average.
            delta = 1 - (
                test.value / hist.value
                if test.lower_is_better
                else hist.value / test.value
            )

            def perf_diff_entry() -> dict:
                """Test result dict annotated with comparison metadata."""
                res = asdict(test)
                res["delta"] = delta
                res["hist_avg"] = hist.value
                res["avg_type"] = hist.average_type
                return res

            if delta > options.regression_threshold:
                improvement.append(perf_diff_entry())
            elif delta < -options.regression_threshold:
                regression.append(perf_diff_entry())

        return improvement, regression

    @staticmethod
    def to_hist(
        avg_type: str,
        result_name: str,
        compare_name: str,
        result_dir: str,
        cutoff: str,
    ) -> tuple:
        """
        Generate a historic average from results named result_name in
        result_dir and compare the results in compare_name against it.

        Parameters:
            avg_type (str): Type of "average" (measure of central tendency) to
                use in historic "average" calculation; only "median" supported
            result_name (str): Save name of the result
            compare_name (str): Result file name to compare historic average
                against
            result_dir (str): Directory to look for results in
            cutoff (str): Timestamp (in YYYYMMDD_HHMMSS) indicating the oldest
                result included in the historic average calculation

        Returns:
            (improvement, regression) tuple from to_hist_avg().
        """
        if avg_type != "median":
            print("Only median is currently supported: refusing to continue.")
            sys.exit(1)

        # TODO call validator on cutoff timestamp.
        # NOTE(review): callers pass sentinel cutoffs like "00000000_000000"
        # that Validate.timestamp rejects, so a relaxed rule is needed first.
        hist_avg = Compare.get_hist_avg(
            result_name, result_dir, cutoff, exclude=[compare_name]
        )
        return Compare.to_hist_avg(hist_avg, f"{result_dir}/{compare_name}.json")
185+
186+
187+
# NOTE(review): this runs at import time with hard-coded example arguments —
# presumably a manual smoke test. Consider moving it under an
# `if __name__ == "__main__":` guard (confirm nothing relies on the
# import-time side effect first).
res = Compare.to_hist("median", "Baseline_PVC_L0", "Baseline_PVC_L0_20250314_170754", "./", "00000000_000000")
print(res)

devops/scripts/benchmarks/options.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,7 @@ class Options:
4545
preset: str = "Full"
4646
custom_results_dir = None
4747

48+
regression_threshold: float = 0.05
49+
4850

4951
options = Options()
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import heapq
2+
import statistics
3+
from abc import ABC, abstractmethod
4+
5+
6+
class Aggregator(ABC):
    """
    Aggregator classes used to "aggregate" a pool of elements, and produce an
    "average" (precisely, some "measure of central tendency") from the elements.
    """

    @staticmethod
    @abstractmethod
    def get_type() -> str:
        """
        Return a string indicating the type of average this aggregator
        produces (e.g. "median").
        """
        pass

    @abstractmethod
    def add(self, n: float):
        """
        Add/aggregate an element to the pool of elements used by this aggregator
        to produce an average calculation.
        """
        pass

    @abstractmethod
    def get_avg(self) -> float:
        """
        Produce an average from the pool of elements aggregated using add().
        """
        pass
35+
36+
37+
class SimpleMedian(Aggregator):
    """
    Simple median calculation: if the number of samples being generated are low,
    this is the fastest median method.
    """

    def __init__(self, starting_elements: "list | None" = None):
        """
        Parameters:
            starting_elements: Optional initial pool of samples.
        """
        # Copy into a fresh list: the original `starting_elements: list = []`
        # default was shared across every instance, and storing the caller's
        # list directly would let add() mutate the caller's data.
        self.elements = list(starting_elements) if starting_elements else []

    @staticmethod
    def get_type() -> str:
        return "median"

    def add(self, n: float):
        """Add a sample to the pool."""
        self.elements.append(n)

    def get_avg(self) -> float:
        """Median of all samples added so far.

        Raises:
            statistics.StatisticsError: if the pool is empty.
        """
        return statistics.median(self.elements)
55+
56+
57+
class StreamingMedian(Aggregator):
    """
    Calculate medians incrementally using heaps: Theoretically the fastest way
    to calculate a median from a stream of elements, but realistically is only
    faster when dealing with huge numbers of samples that would be generated by
    i.e. enabling this workflow in precommit and using longer periods of time.
    """

    def __init__(self, starting_elements: "list | None" = None):
        """
        Parameters:
            starting_elements: Optional initial pool of samples.
        """
        # Two-heap streaming median: maxheap_smaller holds the smaller half of
        # the samples (negated, since heapq is a minheap by default) and
        # minheap_larger holds the larger half. Rebalancing in add() keeps
        #   len(minheap_larger) <= len(maxheap_smaller) <= len(minheap_larger) + 1
        # so the median is at the top of maxheap_smaller (or the mean of both
        # tops when the heaps are the same size).
        self.minheap_larger = []
        self.maxheap_smaller = []

        # Bug fix: the original used map(), which is lazy and was never
        # consumed, so starting_elements were silently ignored.
        for sample in starting_elements or []:
            self.add(sample)

    @staticmethod
    def get_type() -> str:
        return "median"

    def add(self, n: float):
        """Insert a sample, keeping the two heaps balanced."""
        # Note: numbers on maxheap are stored negative, as heapq
        # is a minheap by default.
        if not self.maxheap_smaller or -self.maxheap_smaller[0] >= n:
            heapq.heappush(self.maxheap_smaller, -n)
        else:
            heapq.heappush(self.minheap_larger, n)

        # Rebalance so maxheap_smaller has equal count or exactly one more
        # element than minheap_larger.
        if len(self.maxheap_smaller) > len(self.minheap_larger) + 1:
            heapq.heappush(self.minheap_larger, -heapq.heappop(self.maxheap_smaller))
        elif len(self.maxheap_smaller) < len(self.minheap_larger):
            heapq.heappush(self.maxheap_smaller, -heapq.heappop(self.minheap_larger))

    def get_avg(self) -> float:
        """Median of all samples added so far.

        Raises:
            IndexError: if no samples have been added.
        """
        if len(self.maxheap_smaller) == len(self.minheap_larger):
            # Equal number of elements smaller and larger than "median":
            # thus, there are two median values. The median would then become
            # the average of both median values.
            return (-self.maxheap_smaller[0] + self.minheap_larger[0]) / 2.0
        else:
            # Otherwise maxheap_smaller holds the extra element, so the
            # median is its top. (The original comment claimed the minheap;
            # the balancing above keeps the surplus on the maxheap side.)
            return -self.maxheap_smaller[0]
104+
105+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import re
2+
3+
class Validate:
    """Static class containing methods for validating various fields"""

    @staticmethod
    def filepath(path: str) -> bool:
        """
        Returns True if path is clean (no illegal characters), otherwise False.
        """
        filepath_re = re.compile(r"[a-zA-Z0-9\/\._\-]+")
        # fullmatch ensures the WHOLE path is clean: re.match() only anchors
        # at the start, so e.g. "a;rm -rf" would pass via its clean prefix.
        return filepath_re.fullmatch(path) is not None

    @staticmethod
    def timestamp(t: str) -> bool:
        """
        Returns True if t is in form YYYYMMDD_HHMMSS, otherwise False.
        """
        timestamp_re = re.compile(
            r"^\d{4}(0[1-9]|1[0-2])([0-2][0-9]|3[01])_([01][0-9]|2[0-3])[0-5][0-9][0-5][0-9]$"
        )
        return timestamp_re.match(t) is not None

0 commit comments

Comments
 (0)