
Commit f03df2b

Support analysis along different metrics in the dataset
1 parent 752f6a7 commit f03df2b

File tree

1 file changed (+97, -107 lines)


.ci/scripts/benchmark_tooling/analyze_benchmark_stability.py

Lines changed: 97 additions & 107 deletions
@@ -66,7 +66,11 @@ def is_matching_dataset(primary_sheet, reference_sheet):
 
 
 def analyze_latency_stability(  # noqa: C901
-    primary_file, reference_file=None, output_dir="stability_analysis_results"
+    target_metric,
+    primary_file,
+    reference_file=None,
+    output_dir="stability_analysis_results",
+    verbose_level=0,
 ):
     print(f"Analyzing latency stability from primary file: {primary_file}")
     if reference_file:
@@ -99,31 +103,19 @@ def analyze_latency_stability( # noqa: C901
 
         model, full_device, base_device, os_version = parse_model_device_config(config)
 
-        # Check if required columns exist
-        required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-        if "trimmean_inference_latency(ms)" in df.columns:
-            trimmed_col = "trimmean_inference_latency(ms)"
-            required_cols.append(trimmed_col)
-        else:
-            trimmed_col = None
-
-        if "TPS" in df.columns:
-            tps_col = "TPS"
-            required_cols.append(tps_col)
-        else:
-            tps_col = None
-
         # Skip sheets without required columns
+        required_cols = [target_metric, "metadata_info.timestamp"]
         if not all(col in df.columns for col in required_cols):
             print(f" Skipping {sheetName}: Missing required columns")
             continue
 
         # Convert Date to datetime
         df["Date"] = pd.to_datetime(df["metadata_info.timestamp"])
 
-        # Calculate stability metrics
+        # Calculate stability metrics along the target column in the dataset
         metrics = calculate_stability_metrics(
-            df, "avg_inference_latency(ms)", trimmed_col, tps_col
+            df,
+            target_metric,
         )
 
         primary_datasets[sheetName] = {
@@ -161,21 +153,8 @@ def analyze_latency_stability( # noqa: C901
                 config
             )
 
-            # Check if required columns exist
-            required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-            if "trimmean_inference_latency(ms)" in df.columns:
-                trimmed_col = "trimmean_inference_latency(ms)"
-                required_cols.append(trimmed_col)
-            else:
-                trimmed_col = None
-
-            if "TPS" in df.columns:
-                tps_col = "TPS"
-                required_cols.append(tps_col)
-            else:
-                tps_col = None
-
             # Skip sheets without required columns
+            required_cols = [target_metric, "metadata_info.timestamp"]
             if not all(col in df.columns for col in required_cols):
                 print(
                     f" Skipping reference {sheetName}: Missing required columns{required_cols}"
@@ -187,7 +166,8 @@ def analyze_latency_stability( # noqa: C901
 
             # Calculate stability metrics
             metrics = calculate_stability_metrics(
-                df, "avg_inference_latency(ms)", trimmed_col, tps_col
+                df,
+                target_metric,
             )
 
             reference_datasets[sheetName] = {
@@ -201,30 +181,33 @@ def analyze_latency_stability( # noqa: C901
             }
 
     # Process primary datasets
-    print_section_header("ANALYZING PRIMARY DATASETS")
-    for sheet, info in primary_datasets.items():
-        # Generate dataset report
-        generate_dataset_report(
-            sheet,
-            info["model"],
-            info["full_device"],
-            "Primary",
-            info["df"],
-            info["metrics"],
-            output_dir,
-        )
+    if verbose_level > 2:
+        print_section_header("ANALYZING PRIMARY DATASETS")
+        for sheet, info in primary_datasets.items():
+            # Generate dataset report
+            generate_dataset_report(
+                sheet,
+                target_metric,
+                info["model"],
+                info["full_device"],
+                "Primary",
+                info["df"],
+                info["metrics"],
+                output_dir,
+            )
 
-        # Generate time series plot
-        if len(info["df"]) > 5:  # Only create plot if enough data points
-            generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
+            # Generate time series plot
+            if len(info["df"]) > 5:  # Only create plot if enough data points
+                generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
 
     # Process reference datasets if provided
-    if reference_file:
+    if reference_file and verbose_level > 3:
         print_section_header("ANALYZING REFERENCE DATASETS")
         for sheet, info in reference_datasets.items():
             # Generate dataset report
             generate_dataset_report(
                 sheet,
+                target_metric,
                 info["model"],
                 info["full_device"],
                 "Reference",
@@ -238,7 +221,7 @@ def analyze_latency_stability( # noqa: C901
                 generate_time_series_plot(sheet, info["df"], output_dir, "Reference")
 
     # Generate comparison reports for matching datasets
-    if reference_file:
+    if reference_file and verbose_level > 1:
         print_section_header("PRIVATE VS PUBLIC STABILITY COMPARISON")
         matches_found = False
 
@@ -270,9 +253,10 @@ def analyze_latency_stability( # noqa: C901
         if not matches_found:
             print("No matching datasets found between primary and reference files.")
 
-    # Generate intra-primary summary (comparing across different models/devices)
-    print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
-    generate_intra_primary_summary(primary_datasets, output_dir)
+    if verbose_level > 0:
+        # Generate intra-primary summary (comparing across different models/devices)
+        print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
+        generate_intra_primary_summary(primary_datasets, output_dir)
 
     # Generate summary report for all datasets
     print_section_header("COMPREHENSIVE STABILITY SUMMARY")
@@ -285,28 +269,17 @@ def analyze_latency_stability( # noqa: C901
 
 
 def calculate_stability_metrics(  # noqa: C901
-    df, raw_col, trimmed_col=None, tps_col=None
+    df,
+    target_metric,
 ):
     """Calculate stability metrics for the given dataset"""
     metrics = {}
-
-    # Extract data
-    raw_latency = df[raw_col].values
-    if trimmed_col and trimmed_col in df.columns:
-        trimmed_latency = df[trimmed_col].values
-    else:
-        trimmed_latency = None
-    if tps_col and tps_col in df.columns:
-        tps = df[tps_col].values
-    else:
-        tps = None
+    # Extract data and ignore NaN values
+    raw_latency = df[target_metric].dropna().values
 
     # Central tendency metrics
     metrics["mean_raw_latency"] = np.mean(raw_latency)
     metrics["median_raw_latency"] = np.median(raw_latency)
-    if trimmed_latency is not None:
-        metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
-        metrics["median_trimmed_latency"] = np.median(trimmed_latency)
 
     # Dispersion metrics
     metrics["std_raw_latency"] = np.std(raw_latency, ddof=1)
@@ -316,20 +289,10 @@ def calculate_stability_metrics( # noqa: C901
     metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile(
         raw_latency, 25
     )
-    if trimmed_latency is not None:
-        metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
-        metrics["cv_trimmed_latency"] = (
-            metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
-        ) * 100
-        metrics["iqr_trimmed_latency"] = np.percentile(
-            trimmed_latency, 75
-        ) - np.percentile(trimmed_latency, 25)
 
     # Percentile metrics
     for p in [50, 90, 95, 99]:
         metrics[f"p{p}_raw_latency"] = np.percentile(raw_latency, p)
-        if trimmed_latency is not None:
-            metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
 
     # Inter-jitter metrics (variability between runs)
     if np.min(raw_latency) > 0:
@@ -342,37 +305,45 @@ def calculate_stability_metrics( # noqa: C901
         metrics["p99_raw_latency"] / metrics["p50_raw_latency"]
     )
 
-    if trimmed_latency is not None:
-        if np.min(trimmed_latency) > 0:
-            metrics["max_min_range_ratio_trimmed"] = np.max(trimmed_latency) / np.min(
-                trimmed_latency
-            )
-        else:
-            metrics["max_min_range_ratio_trimmed"] = float("inf")
-            print(
-                "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+    # Intra-jitter proxy (if both raw and trimmed latency are available)
+    trimmed_metric_col = "trimmean_inference_latency(ms)"
+    if (
+        target_metric == "avg_inference_latency(ms)"
+        and trimmed_metric_col in df.columns
+    ):
+        trimmed_latency = df[trimmed_metric_col].values
+        if trimmed_latency is not None:
+            metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
+            metrics["median_trimmed_latency"] = np.median(trimmed_latency)
+            metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
+            metrics["cv_trimmed_latency"] = (
+                metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
+            ) * 100
+            metrics["iqr_trimmed_latency"] = np.percentile(
+                trimmed_latency, 75
+            ) - np.percentile(trimmed_latency, 25)
+            for p in [50, 90, 95, 99]:
+                metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
+            if np.min(trimmed_latency) > 0:
+                metrics["max_min_range_ratio_trimmed"] = np.max(
+                    trimmed_latency
+                ) / np.min(trimmed_latency)
+            else:
+                metrics["max_min_range_ratio_trimmed"] = float("inf")
+                print(
+                    "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+                )
+            metrics["p99_p50_ratio_trimmed"] = (
+                metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
             )
-
-        metrics["p99_p50_ratio_trimmed"] = (
-            metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
-        )
-
-    # Intra-jitter proxy (if both raw and trimmed are available)
-    if trimmed_latency is not None:
-        trimming_effect = (raw_latency - trimmed_latency) / raw_latency
-        metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
-        metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
-
-    # TPS metrics
-    if tps is not None:
-        metrics["mean_tps"] = np.mean(tps)
-        metrics["std_tps"] = np.std(tps, ddof=1)
-        metrics["cv_tps"] = (metrics["std_tps"] / metrics["mean_tps"]) * 100
+            trimming_effect = (raw_latency - trimmed_latency) / raw_latency
+            metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
+            metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
 
     # Time-based stability (rolling window of 5 samples)
     if len(df) >= 5:
         df_sorted = df.sort_values("Date")
-        rolling_std = df_sorted[raw_col].rolling(window=5).std()
+        rolling_std = df_sorted[target_metric].rolling(window=5).std()
         metrics["mean_rolling_std"] = rolling_std.mean()
         metrics["max_rolling_std"] = rolling_std.max()
 
@@ -419,7 +390,7 @@ def calculate_stability_metrics( # noqa: C901
 
 
 def generate_dataset_report(  # noqa: C901
-    sheet_name, model, device, dataset_type, df, metrics, output_dir
+    sheet_name, target_column, model, device, dataset_type, df, metrics, output_dir
 ):
     """Generate a detailed report for a single dataset"""
     report_file = f"{output_dir}/{sheet_name}_{dataset_type.lower()}_report.txt"
@@ -436,7 +407,9 @@ def generate_dataset_report( # noqa: C901
 
     # Dataset overview
     report_content.append("Dataset Overview:")
-    report_content.append(f" - Number of samples: {len(df)}")
+    report_content.append(
+        f" - Number of samples: {len(df[target_column].dropna().values)}"
+    )
     report_content.append(f" - Date range: {df['Date'].min()} to {df['Date'].max()}")
     report_content.append("")
 
@@ -1541,17 +1514,34 @@ def main():
         help="Path to Excel file containing reference (public) benchmark data for comparison",
         default=None,
     )
+    parser.add_argument(
+        "--metric",
+        help="Target metric to analyze (default: avg_inference_latency(ms)). Examples: avg_inference_latency(ms), token_per_sec",
+        default="avg_inference_latency(ms)",
+    )
     parser.add_argument(
         "--output-dir",
         default="stability_analysis_results",
         help="Directory to save analysis results (default: stability_analysis_results)",
     )
-
+    parser.add_argument(
+        "--verbose-level",
+        type=int,
+        default=0,
+        choices=range(4),
+        help="Verbose level 0-3 (default: 0) to control analysis output detail. Higher values show more detailed results.",
+    )
     # Parse arguments
     args = parser.parse_args()
 
     # Run analysis
-    analyze_latency_stability(args.primary_file, args.reference_file, args.output_dir)
+    analyze_latency_stability(
+        args.metric,
+        args.primary_file,
+        args.reference_file,
+        args.output_dir,
+        args.verbose_level,
+    )
 
 
 if __name__ == "__main__":
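As a usage note, a hedged sketch of how the re-parameterized entry point might be called after this change; the import path and Excel file names are placeholders, and "token_per_sec" is taken from the --metric help text as an example column:

# Hypothetical invocation; only the parameter order shown in this commit is assumed.
from analyze_benchmark_stability import analyze_latency_stability  # placeholder import path

analyze_latency_stability(
    "token_per_sec",  # target_metric: any column present in the benchmark sheets
    "private_benchmarks.xlsx",  # primary_file (placeholder path)
    reference_file="public_benchmarks.xlsx",  # optional comparison data (placeholder path)
    output_dir="stability_analysis_results",
    verbose_level=2,  # 0-3; higher values print more report sections
)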
