
Commit 317179e

Remove double baseline calculations for CI microbenchmarks (#2613)
1 parent aec9a79 commit 317179e

11 files changed (+176 -85 lines)

benchmarks/dashboard/ci_microbenchmark_runner.py

Lines changed: 4 additions & 3 deletions
@@ -125,7 +125,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]:
                benchmark_name="TorchAO Quantization Benchmark",
                shape=[config.m, config.k, config.n],
                metric_name="Fwd Speedup (x)",
-               metric_values=[result.speedup],
+               metric_values=[result.compile_speedup_on_baseline],
                quant_type=config.quantization,
                device=config.device,
                torch_compile_mode=config.torch_compile_mode,
@@ -135,7 +135,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]:
                benchmark_name="TorchAO Quantization Benchmark",
                shape=[config.m, config.k, config.n],
                metric_name="Bfloat16 Fwd Time (ms)",
-               metric_values=[result.baseline_inference_time_in_ms],
+               metric_values=[result.baseline_model_compiled_inference_time_in_ms],
                quant_type=config.quantization,
                device=config.device,
                torch_compile_mode=config.torch_compile_mode,
@@ -148,7 +148,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]:
                benchmark_name="TorchAO Quantization Benchmark",
                shape=[config.m, config.k, config.n],
                metric_name="Quantized Fwd Time (ms)",
-               metric_values=[result.model_inference_time_in_ms],
+               metric_values=[result.quantized_model_compiled_inference_time_in_ms],
                quant_type=config.quantization,
                device=config.device,
                torch_compile_mode=config.torch_compile_mode,
@@ -175,6 +175,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]:


def main():
+    torch.manual_seed(42)
    parser = argparse.ArgumentParser(
        description="Run microbenchmarks and output results in PyTorch OSS benchmark database format"
    )
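
For orientation, the three CI metrics emitted per shape now all come from compiled measurements on a single BenchmarkResult. Below is a minimal sketch of how those records might be assembled; it uses plain dicts and SimpleNamespace stand-ins rather than the repository's own record helper, which is not shown in this hunk.

from types import SimpleNamespace
from typing import Any, Dict, List


def records_for_result(config: Any, result: Any) -> List[Dict[str, Any]]:
    """Assemble the three CI metric records from one benchmark result (illustrative only)."""
    common = {
        "benchmark_name": "TorchAO Quantization Benchmark",
        "shape": [config.m, config.k, config.n],
        "quant_type": config.quantization,
        "device": config.device,
        "torch_compile_mode": config.torch_compile_mode,
    }
    return [
        dict(common, metric_name="Fwd Speedup (x)",
             metric_values=[result.compile_speedup_on_baseline]),
        dict(common, metric_name="Bfloat16 Fwd Time (ms)",
             metric_values=[result.baseline_model_compiled_inference_time_in_ms]),
        dict(common, metric_name="Quantized Fwd Time (ms)",
             metric_values=[result.quantized_model_compiled_inference_time_in_ms]),
    ]


# Example with stand-in objects:
config = SimpleNamespace(m=1024, k=1024, n=1024, quantization="int8wo",
                         device="cuda", torch_compile_mode="max-autotune")
result = SimpleNamespace(compile_speedup_on_baseline=1.7,
                         baseline_model_compiled_inference_time_in_ms=56.0,
                         quantized_model_compiled_inference_time_in_ms=33.0)
for record in records_for_result(config, result):
    print(record["metric_name"], record["metric_values"])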

benchmarks/dashboard/microbenchmark_quantization_config.yml

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@ model_params:
    min_power: 10
    max_power: 15
    high_precision_dtype: "torch.bfloat16"
-   use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "linear"

benchmarks/microbenchmarks/benchmark_inference.py

Lines changed: 125 additions & 33 deletions
@@ -13,6 +13,7 @@
import os
from copy import deepcopy
from pathlib import Path
+from typing import Dict, Tuple

import torch

@@ -34,15 +35,72 @@
    create_model_and_input_data,
)

+# -----------------------------------------------------------------------------
+# Baseline caching
+#
+# ``_BASELINE_CACHE`` maps a key built by ``_make_cache_key(config)``:
+#     (model_type, m, k, n, high_precision_dtype, device, torch_compile_mode)
+# to a tuple ``(eager_baseline_time, compile_baseline_time)``, e.g.
+#     ("linear", 1024, 1024, 1024, torch.bfloat16, "cuda", "default") -> (95.00, 56.00)
+# The cached baseline inference times are reused to compute speedup metrics, so
+# each baseline configuration is measured only once and overall benchmarking
+# time drops. The cache is internal to this module; do not access it directly.
+# -----------------------------------------------------------------------------
+
+_BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {}
+
+
+def _make_cache_key(config: BenchmarkConfig) -> Tuple:
+    """Create a key for caching based on benchmark configuration.
+
+    Parameters that affect baseline performance are included:
+
+    * model type (e.g. ``linear`` or ``transformer_block``)
+    * shape dimensions (m, k, n)
+    * high precision dtype (bf16, fp16, etc.)
+    * device (cuda, cpu, mps)
+    * torch.compile mode
+
+    Sparsity and quantization settings are deliberately excluded because the
+    baseline (non-quantized, non-sparse) performance is independent of those
+    attributes.
+    """
+    return (
+        config.model_type,
+        config.m,
+        config.k,
+        config.n,
+        config.high_precision_dtype,
+        config.device,
+        config.torch_compile_mode,
+    )
+

def run(config: BenchmarkConfig) -> BenchmarkResult:
-    """Run inference benchmarks"""
+    """
+    Run inference benchmarks.
+
+    The function first checks whether a baseline for the given configuration
+    already exists in the internal cache. If not, it measures the eager and
+    compiled baseline inference times and stores them; otherwise the cached
+    values are reused to calculate speedup metrics.
+
+    Args:
+        config (BenchmarkConfig): Benchmark configuration.
+
+    Returns:
+        BenchmarkResult: Result of the benchmark.
+    """
    try:
        clean_caches()  # Clean caches

        # Create output directory if it doesn't exist
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)

+        # Prepare result container
+        result = BenchmarkResult(config=config)
+
+        # Create model and input data
        base_model, input_data = create_model_and_input_data(
            config.model_type,
            config.m,
@@ -51,28 +109,47 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
            high_precision_dtype=config.high_precision_dtype,
            device=config.device,
        )
-        # Copy base model for quantizing
-        m_copy = deepcopy(base_model)

-        # Run benchmarks
-        result = BenchmarkResult(config=config)
+        # Generate a cache key for the current configuration
+        cache_key = _make_cache_key(config)

-        # Store result in model for memory profiling
-        base_model._benchmark_result = result
-
-        # Run baseline benchmarking
-        base_model = base_model.eval().to(config.device)
-        if config.use_torch_compile:
-            print("Compiling baseline model....")
-            base_model = torch.compile(
-                base_model, mode=config.torch_compile_mode, fullgraph=True
+        # Check whether the baseline for this configuration has already been computed
+        if cache_key not in _BASELINE_CACHE:
+            # Switch model to eval and move to device
+            m_copy = deepcopy(base_model)
+            m_copy = m_copy.eval().to(config.device)
+            print("Benchmarking eager baseline inference.....")
+            eager_baseline_time = model_inference_time_in_ms(
+                model=m_copy, input_data=input_data
            )
-        # Benchmark time to run an inference call for baseline model
-        print("Benchmarking baseline inference.....")
-        result.baseline_inference_time_in_ms = model_inference_time_in_ms(
-            model=base_model, input_data=input_data
-        )

+            print("Benchmarking compiled baseline inference.....")
+            m_copy = torch.compile(
+                m_copy, mode=config.torch_compile_mode, fullgraph=True
+            )
+            compile_baseline_time = model_inference_time_in_ms(
+                model=m_copy, input_data=input_data
+            )
+
+            # Cache the eager and compiled baseline times for this configuration
+            _BASELINE_CACHE[cache_key] = (eager_baseline_time, compile_baseline_time)
+
+            result.baseline_model_eager_inference_time_in_ms = eager_baseline_time
+            result.baseline_model_compiled_inference_time_in_ms = compile_baseline_time
+        else:
+            # Retrieve cached values
+            cached_eager_time, cached_compile_time = _BASELINE_CACHE[cache_key]
+            result.baseline_model_eager_inference_time_in_ms = cached_eager_time
+            result.baseline_model_compiled_inference_time_in_ms = cached_compile_time
+
+        # At this point ``base_model`` is still uncompiled and ready for
+        # quantization, ``input_data`` is the matching input tensor, and the
+        # baseline times are stored on ``result``.
+
+        # Copy base model for quantizing/sparsifying
+        m_copy = deepcopy(base_model)
+
+        # Determine quantization/sparsity configuration
        ao_base_config = string_to_config(
            config.quantization,
            config.sparsity,
@@ -101,24 +178,39 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
        m_copy = m_copy.eval().to(config.device)
        quantize_(m_copy, ao_base_config)

-        if config.use_torch_compile:
-            print("Compiling quantized model....")
-            m_copy = torch.compile(
-                m_copy, mode=config.torch_compile_mode, fullgraph=True
-            )
-
        # Store result in model for memory profiling
        m_copy._benchmark_result = result

-        # Benchmark time to run an inference call for quantized model
-        print("Benchmarking quantized model.....")
-        result.model_inference_time_in_ms = model_inference_time_in_ms(
+        # Measure inference time for the eager quantized model
+        print("Benchmarking eager quantized model.....")
+        result.quantized_model_eager_inference_time_in_ms = model_inference_time_in_ms(
            model=m_copy, input_data=input_data
        )

-        # Calculate speedup w.r.t. baseline
-        result.speedup = round(
-            result.baseline_inference_time_in_ms / result.model_inference_time_in_ms, 2
+        # Measure inference time for the compiled quantized model
+        print("Benchmarking compiled quantized model.....")
+        m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)
+        result.quantized_model_compiled_inference_time_in_ms = (
+            model_inference_time_in_ms(model=m_copy, input_data=input_data)
+        )
+
+        # Compute eager speedup relative to the eager baseline
+        result.eager_speedup_on_baseline = round(
+            result.baseline_model_eager_inference_time_in_ms
+            / result.quantized_model_eager_inference_time_in_ms,
+            ndigits=2,
+        )
+        # Compute compile speedup relative to the compiled baseline
+        result.compile_speedup_on_baseline = round(
+            result.baseline_model_compiled_inference_time_in_ms
+            / result.quantized_model_compiled_inference_time_in_ms,
+            ndigits=2,
+        )
+        # Compute speedup of the compiled quantized model over the eager quantized model
+        result.compile_speedup_on_eager = round(
+            result.quantized_model_eager_inference_time_in_ms
+            / result.quantized_model_compiled_inference_time_in_ms,
+            ndigits=2,
        )

        # Run profiler if enabled
@@ -165,9 +257,9 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
                        result.memory_profile_path
                    )
            except ValueError as e:
-               if "not enough values to unpack" in e:
+               if "not enough values to unpack" in str(e):
                    print(
-                       "Failed due to existing bugs, re-run the code to generate memory profile. Please raise an issue if it persists."
+                       "Failed due to existing bugs, rerun the code to generate memory profile. Please raise an issue if it persists."
                    )
            except Exception as e:
                print(f"Error running memory profiler: {e}")
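
Stepping back, the heart of this change is a memoization pattern: eager and compiled baseline timings are keyed by the configuration fields that actually affect them, measured once, then reused by every quantization or sparsity recipe that shares the same model type, shape, dtype, device, and compile mode. The following is a condensed, self-contained sketch of that pattern under stated assumptions: the helper names (baseline_times, _time_inference_ms) are invented for illustration, and the crude timer stands in for the repository's model_inference_time_in_ms.

import time
from typing import Dict, Tuple

import torch

# Cache: configuration key -> (eager_baseline_ms, compiled_baseline_ms)
_BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {}


def _time_inference_ms(model: torch.nn.Module, x: torch.Tensor, iters: int = 10) -> float:
    """Rough stand-in for model_inference_time_in_ms: average forward time in ms."""
    with torch.no_grad():
        model(x)  # warm-up (also triggers compilation for compiled models)
        start = time.perf_counter()
        for _ in range(iters):
            model(x)
    return (time.perf_counter() - start) / iters * 1e3


def baseline_times(model: torch.nn.Module, x: torch.Tensor, key: Tuple,
                   compile_mode: str = "default") -> Tuple[float, float]:
    """Measure eager and compiled baselines once per key; reuse them afterwards."""
    if key not in _BASELINE_CACHE:
        model = model.eval()
        eager_ms = _time_inference_ms(model, x)
        compiled = torch.compile(model, mode=compile_mode, fullgraph=True)
        compiled_ms = _time_inference_ms(compiled, x)
        _BASELINE_CACHE[key] = (eager_ms, compiled_ms)
    return _BASELINE_CACHE[key]


if __name__ == "__main__":
    key = ("linear", 1024, 1024, 1024, "torch.float32", "cpu", "default")
    model = torch.nn.Linear(1024, 1024)
    x = torch.randn(64, 1024)
    eager_ms, compiled_ms = baseline_times(model, x, key)
    # A second call with the same key returns the cached values without re-measuring.
    assert baseline_times(model, x, key) == (eager_ms, compiled_ms)
    print(f"eager: {eager_ms:.2f} ms, compiled: {compiled_ms:.2f} ms")

The speedups reported in the diff then follow directly: eager_speedup_on_baseline divides the eager baseline time by the eager quantized time, and compile_speedup_on_baseline divides the compiled baseline time by the compiled quantized time, each rounded to two digits.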

benchmarks/microbenchmarks/benchmark_runner.py

Lines changed: 0 additions & 3 deletions
@@ -139,9 +139,6 @@ def get_quantization_sparsity_recipes(
    """
    config_recipes = set()

-    # Always include baseline without sparsity
-    config_recipes.add(("baseline", None))
-
    # Add all quantization techniques without sparsity
    for quant_config in quantization_recipes:
        config_recipes.add((quant_config, None))
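
With the explicit ("baseline", None) entry gone, the recipe set contains only real quantization/sparsity combinations; the baseline is measured (or reused from the cache) inside each run. Here is a simplified sketch of how such a recipe set might be built; the sparsity-pairing loop is an assumption, since the rest of the function body is not shown in this hunk.

from typing import Iterable, Optional, Set, Tuple


def get_quantization_sparsity_recipes(
    quantization_recipes: Iterable[str],
    sparsity_recipes: Iterable[Optional[str]],
) -> Set[Tuple[str, Optional[str]]]:
    """Build (quantization, sparsity) pairs to benchmark, with no explicit baseline entry."""
    config_recipes: Set[Tuple[str, Optional[str]]] = set()

    # Add all quantization techniques without sparsity
    for quant_config in quantization_recipes:
        config_recipes.add((quant_config, None))

    # Pair quantization techniques with sparsity settings
    # (the real function also filters out incompatible combinations).
    for quant_config in quantization_recipes:
        for sparsity in sparsity_recipes:
            if sparsity is not None:
                config_recipes.add((quant_config, sparsity))

    return config_recipes


# Example: two schemes and one sparsity setting yield four recipes, none of them "baseline".
print(get_quantization_sparsity_recipes(["int8wo", "int4wo-128"], ["semi-sparse", None]))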

benchmarks/microbenchmarks/test/benchmark_config.yml

Lines changed: 0 additions & 4 deletions
@@ -13,7 +13,6 @@ model_params:
    min_power: 14
    max_power: 16
    high_precision_dtype: "torch.bfloat16"
-   use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "linear"
@@ -27,7 +26,6 @@ model_params:
      [2048, 4096, 1024],
    ]
    high_precision_dtype: "torch.bfloat16"
-   use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "ln_linear_sigmoid"
@@ -41,7 +39,6 @@ model_params:
      [2048, 4096, 1024], # For transformer_block, k is the hidden dimension
    ]
    high_precision_dtype: "torch.bfloat16"
-   use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition)
@@ -58,7 +55,6 @@ model_params:
    min_power: 10 # 1024
    max_power: 11 # 2048
    high_precision_dtype: "torch.bfloat16"
-   use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "linear"

benchmarks/microbenchmarks/test/test_benchmark_inference.py

Lines changed: 9 additions & 6 deletions
@@ -21,7 +21,6 @@ def setUp(self):
            sparsity="semi-sparse",
            params={
                "high_precision_dtype": "torch.float32",
-               "use_torch_compile": False,
                "device": "cpu",
                "model_type": "linear",
            },
@@ -46,7 +45,9 @@ def test_run_inference(self, mock_string_to_config):

        result = run(self.config)
        self.assertIsInstance(result, BenchmarkResult)
-        self.assertTrue(hasattr(result, "model_inference_time_in_ms"))
+        self.assertTrue(
+            hasattr(result, "quantized_model_compiled_inference_time_in_ms")
+        )

    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
    def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
@@ -64,7 +65,6 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
            sparsity="semi-sparse",
            params={
                "high_precision_dtype": "torch.float32",
-               "use_torch_compile": False,
                "device": "cpu",
                "model_type": "linear",
            },
@@ -75,7 +75,9 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
        )
        result = run(config)
        self.assertIsInstance(result, BenchmarkResult)
-        self.assertTrue(hasattr(result, "model_inference_time_in_ms"))
+        self.assertTrue(
+            hasattr(result, "quantized_model_compiled_inference_time_in_ms")
+        )

    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
    def test_run_inference_with_block_sparsity(self, mock_string_to_config):
@@ -92,7 +94,6 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config):
            sparsity="block",
            params={
                "high_precision_dtype": "torch.float32",
-               "use_torch_compile": False,
                "device": "cpu",
                "model_type": "linear",
            },
@@ -103,7 +104,9 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config):
        )
        result = run(config)
        self.assertIsInstance(result, BenchmarkResult)
-        self.assertTrue(hasattr(result, "model_inference_time_in_ms"))
+        self.assertTrue(
+            hasattr(result, "quantized_model_compiled_inference_time_in_ms")
+        )


if __name__ == "__main__":
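
The updated assertions only check for the renamed quantized_model_compiled_inference_time_in_ms attribute. If a test also wanted to cover the other fields introduced in benchmark_inference.py, a small helper along these lines could be used; the helper itself is hypothetical, but the attribute names are the ones added by this commit.

def assert_benchmark_result_fields(test_case, result):
    """Assert that a BenchmarkResult carries the renamed timing and speedup fields (illustrative)."""
    for attr in (
        "baseline_model_eager_inference_time_in_ms",
        "baseline_model_compiled_inference_time_in_ms",
        "quantized_model_eager_inference_time_in_ms",
        "quantized_model_compiled_inference_time_in_ms",
        "eager_speedup_on_baseline",
        "compile_speedup_on_baseline",
        "compile_speedup_on_eager",
    ):
        test_case.assertTrue(hasattr(result, attr), f"missing {attr}")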

benchmarks/microbenchmarks/test/test_benchmark_profiler.py

Lines changed: 5 additions & 6 deletions
@@ -270,13 +270,12 @@ def test_memory_profiler_cuda_unavailable(self):
            f"{config.name}_{self.m}_{self.k}_{self.n}_memory_profile.json",
        )

-        # Generate memory profile
-        result, memory_stats = generate_memory_profile(
-            self.model, self.input_data, memory_profile_path
-        )
-
        # Should return None when CUDA is unavailable
-        self.assertIsNone(result)
+        self.assertIsNone(
+            generate_memory_profile(
+                self.model, self.input_data, memory_profile_path
+            )
+        )

        # Should not create file when CUDA is unavailable
        self.assertFalse(os.path.exists(memory_profile_path))

benchmarks/microbenchmarks/test/test_benchmark_runner.py

Lines changed: 0 additions & 2 deletions
@@ -39,7 +39,6 @@ def setUp(self):
                    }
                ],
                "high_precision_dtype": "torch.bfloat16",
-               "use_torch_compile": True,
                "torch_compile_mode": "max-autotune",
                "device": "cpu",
                "model_type": "linear",
@@ -130,7 +129,6 @@ def test_get_param_combinations(self):
        self.assertEqual(len(shapes), 1)
        self.assertEqual(shapes[0], ("custom", [1024, 1024, 1024]))
        self.assertEqual(params["high_precision_dtype"], "torch.bfloat16")
-        self.assertEqual(params["use_torch_compile"], True)

    @patch("argparse.Namespace")
    def test_load_benchmark_configs(self, mock_args):
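
Across the config files and tests, the practical takeaway is that use_torch_compile is no longer a recognized parameter: both eager and compiled paths are always measured, and torch_compile_mode alone controls how compilation runs. A minimal illustration of the parameter set after this change, with keys taken from the hunks above:

# Parameter dict as supplied to the microbenchmark configs/tests after this change.
params = {
    "high_precision_dtype": "torch.bfloat16",
    "torch_compile_mode": "max-autotune",
    "device": "cpu",
    "model_type": "linear",
    # "use_torch_compile": True,  # removed: compilation is always benchmarked now
}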
