Commit f1b5947

Support analysis along different metrics in the dataset (#11937)
### Summary

- Allow running the benchmark analysis along a target metric in the dataset
- Add a verbose level to control how much detail is reported
- Bug fixes to properly handle `nan` values in the dataset

### Test plan

Analyze the stability of the reported metrics along `token_per_sec` for `Qwen3-0.6B` on all devices with all recipes (hf/optimum-et vs etLLM):

`python .ci/scripts/benchmark_tooling/analyze_benchmark_stability.py --primary-file private.xlsx --reference-file public.xlsx --metric token_per_sec --verbose-level 0`

Report results:

```
====================================================================================================
===== Analyzing Stability Against Metric 'token_per_sec' ==========================================
====================================================================================================
Primary dataset: private.xlsx
Reference dataset for comparison: public.xlsx
====================================================================================================
===== LOADING PRIMARY DATASETS (Private) ==========================================================
====================================================================================================
successfully fetched 10 sheets from private.xlsx
Loading dataset: table1 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 (private)', 'arch': 'iOS 18.0', 'total_rows': 59, 'aws_type': 'private'}
Loading dataset: table2 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 Plus (private)', 'arch': 'iOS 17.4.1', 'total_rows': 58, 'aws_type': 'private'}
Loading dataset: table3 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 Pro (private)', 'arch': 'iOS 18.4.1', 'total_rows': 59, 'aws_type': 'private'}
Loading dataset: table4 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Samsung Galaxy S22 5G (private)', 'arch': 'Android 13', 'total_rows': 79, 'aws_type': 'private'}
Loading dataset: table5 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Samsung Galaxy S22 Ultra 5G (private)', 'arch': 'Android 14', 'total_rows': 79, 'aws_type': 'private'}
Loading dataset: table6 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 (private)', 'arch': 'iOS 18.0', 'total_rows': 57, 'aws_type': 'private'}
Loading dataset: table7 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 Plus (private)', 'arch': 'iOS 17.4.1', 'total_rows': 57, 'aws_type': 'private'}
Loading dataset: table8 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 Pro (private)', 'arch': 'iOS 18.4.1', 'total_rows': 57, 'aws_type': 'private'}
Loading dataset: table9 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Samsung Galaxy S22 5G (private)', 'arch': 'Android 13', 'total_rows': 78, 'aws_type': 'private'}
Loading dataset: table10 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Samsung Galaxy S22 Ultra 5G (private)', 'arch': 'Android 14', 'total_rows': 78, 'aws_type': 'private'}
====================================================================================================
===== LOADING REFERENCE DATASETS (Public) =========================================================
====================================================================================================
successfully fetched 6 sheets from public.xlsx
Loading dataset: table1 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15', 'arch': 'iOS 18.0', 'total_rows': 45, 'aws_type': 'public'}
Loading dataset: table2 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 Plus', 'arch': 'iOS 17.4.1', 'total_rows': 43, 'aws_type': 'public'}
Loading dataset: table3 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'et_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Samsung Galaxy S22 5G', 'arch': 'Android 13', 'total_rows': 71, 'aws_type': 'public'}
Loading dataset: table4 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15', 'arch': 'iOS 18.0', 'total_rows': 43, 'aws_type': 'public'}
Loading dataset: table5 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Apple iPhone 15 Plus', 'arch': 'iOS 17.4.1', 'total_rows': 42, 'aws_type': 'public'}
Loading dataset: table6 with config: {'model': 'Qwen/Qwen3-0.6B', 'backend': 'hf_xnnpack_custom_spda_kv_cache_8da4w', 'device': 'Samsung Galaxy S22 5G', 'arch': 'Android 13', 'total_rows': 71, 'aws_type': 'public'}
====================================================================================================
===== COMPREHENSIVE STABILITY SUMMARY =============================================================
====================================================================================================
Comprehensive Latency Stability Analysis Summary
================================================================================

Primary (Private) Datasets Summary:
| Dataset | Model | Device | Mean Value | CV (%) | Stability Score | Stability Rating |
|---------|-------|--------|------------|--------|-----------------|------------------|
| table10 | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Samsung Galaxy S22 Ultra 5G (private)(Android 14) | 62.82 | 1.45 | 91.17 | Excellent |
| table9  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Samsung Galaxy S22 5G (private)(Android 13) | 61.79 | 1.85 | 88.38 | Good |
| table5  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Samsung Galaxy S22 Ultra 5G (private)(Android 14) | 64.65 | 2.32 | 86.10 | Good |
| table4  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Samsung Galaxy S22 5G (private)(Android 13) | 62.27 | 3.02 | 81.37 | Good |
| table3  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 Pro (private)(iOS 18.4.1) | 24.69 | 3.39 | 78.78 | Moderate |
| table8  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 Pro (private)(iOS 18.4.1) | 22.88 | 3.65 | 78.23 | Moderate |
| table1  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 (private)(iOS 18.0) | 7.66 | 3.75 | 76.56 | Moderate |
| table6  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 (private)(iOS 18.0) | 7.14 | 4.18 | 73.67 | Moderate |
| table2  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 Plus (private)(iOS 17.4.1) | 6.52 | 4.36 | 73.08 | Moderate |
| table7  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 Plus (private)(iOS 17.4.1) | 6.11 | 4.50 | 72.90 | Moderate |

Reference (Public) Datasets Summary:
| Dataset | Model | Device | Mean Value | CV (%) | Stability Score | Stability Rating |
|---------|-------|--------|------------|--------|-----------------|------------------|
| table6  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Samsung Galaxy S22 5G(Android 13) | 62.78 | 3.72 | 77.73 | Moderate |
| table3  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Samsung Galaxy S22 5G(Android 13) | 62.68 | 4.30 | 74.12 | Moderate |
| table2  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 Plus(iOS 17.4.1) | 7.08 | 5.21 | 67.91 | Moderate |
| table5  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15 Plus(iOS 17.4.1) | 6.49 | 5.42 | 67.74 | Moderate |
| table4  | Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15(iOS 18.0) | 7.03 | 7.17 | 55.51 | Poor |
| table1  | Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) | Apple iPhone 15(iOS 18.0) | 6.89 | 20.22 | 21.99 | Poor |

Private vs Public Comparison:
| Dataset | Private Device | Public Device | Private Score | Public Score | Score Diff | Private CV (%) | Public CV (%) | CV Diff (%) |
|---------|----------------|---------------|---------------|--------------|------------|----------------|---------------|-------------|
| Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) on Apple iPhone 15 (private) | Apple iPhone 15 (private) (iOS 18.0) | Apple iPhone 15 (iOS 18.0) | 76.56 | 21.99 | 54.58 | 3.75 | 20.22 | -16.46 |
| Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) on Apple iPhone 15 (private) | Apple iPhone 15 (private) (iOS 18.0) | Apple iPhone 15 (iOS 18.0) | 73.67 | 55.51 | 18.17 | 4.18 | 7.17 | -2.99 |
| Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) on Samsung Galaxy S22 5G (private) | Samsung Galaxy S22 5G (private) (Android 13) | Samsung Galaxy S22 5G (Android 13) | 88.38 | 77.73 | 10.64 | 1.85 | 3.72 | -1.87 |
| Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) on Samsung Galaxy S22 5G (private) | Samsung Galaxy S22 5G (private) (Android 13) | Samsung Galaxy S22 5G (Android 13) | 81.37 | 74.12 | 7.25 | 3.02 | 4.30 | -1.28 |
| Qwen/Qwen3-0.6B(et_xnnpack_custom_spda_kv_cache_8da4w) on Apple iPhone 15 Plus (private) | Apple iPhone 15 Plus (private) (iOS 17.4.1) | Apple iPhone 15 Plus (iOS 17.4.1) | 73.08 | 67.91 | 5.17 | 4.36 | 5.21 | -0.86 |
| Qwen/Qwen3-0.6B(hf_xnnpack_custom_spda_kv_cache_8da4w) on Apple iPhone 15 Plus (private) | Apple iPhone 15 Plus (private) (iOS 17.4.1) | Apple iPhone 15 Plus (iOS 17.4.1) | 72.90 | 67.74 | 5.16 | 4.50 | 5.42 | -0.92 |

Private environment is more stable in 6 of 6 cases.
Public environment is more stable in 0 of 6 cases.

Overall Insights and Recommendations:

Stability Distribution in Private Datasets:
- Moderate: 6 dataset(s)
- Good: 3 dataset(s)
- Excellent: 1 dataset(s)
```
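For context, here is a minimal sketch of the NaN-safe metric extraction this change introduces, simplified from `calculate_stability_metrics`. The sheet name `table1` and the `token_per_sec` column are illustrative assumptions, not part of the script itself:

```python
import numpy as np
import pandas as pd

# Hypothetical sheet/column names, for illustration only.
df = pd.read_excel("private.xlsx", sheet_name="table1")

# NaN entries in the target metric are dropped before any statistics are
# computed, so a few failed runs no longer skew the mean or the CV.
values = df["token_per_sec"].dropna().values

mean = np.mean(values)
std = np.std(values, ddof=1)       # sample standard deviation
cv_percent = (std / mean) * 100    # coefficient of variation, as shown in the summary tables
print(f"mean={mean:.2f}, CV={cv_percent:.2f}%")
```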
1 parent 124758e commit f1b5947

File tree

1 file changed (+108, -117 lines changed)

.ci/scripts/benchmark_tooling/analyze_benchmark_stability.py

Lines changed: 108 additions & 117 deletions
```diff
@@ -66,11 +66,16 @@ def is_matching_dataset(primary_sheet, reference_sheet):
 
 
 def analyze_latency_stability( # noqa: C901
-    primary_file, reference_file=None, output_dir="stability_analysis_results"
+    target_metric,
+    primary_file,
+    reference_file=None,
+    output_dir="stability_analysis_results",
+    verbose_level=0,
 ):
-    print(f"Analyzing latency stability from primary file: {primary_file}")
+    print_section_header(f"Analyzing Stability Against Metric '{target_metric}'")
+    print(f"Primary dataset: {primary_file}")
     if reference_file:
-        print(f"Using reference file for comparison: {reference_file}")
+        print(f"Reference dataset for comparison: {reference_file}")
 
     # Create output directory if it doesn't exist
     if not os.path.exists(output_dir):
@@ -99,31 +104,19 @@ def analyze_latency_stability( # noqa: C901
 
         model, full_device, base_device, os_version = parse_model_device_config(config)
 
-        # Check if required columns exist
-        required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-        if "trimmean_inference_latency(ms)" in df.columns:
-            trimmed_col = "trimmean_inference_latency(ms)"
-            required_cols.append(trimmed_col)
-        else:
-            trimmed_col = None
-
-        if "TPS" in df.columns:
-            tps_col = "TPS"
-            required_cols.append(tps_col)
-        else:
-            tps_col = None
-
         # Skip sheets without required columns
+        required_cols = [target_metric, "metadata_info.timestamp"]
         if not all(col in df.columns for col in required_cols):
            print(f" Skipping {sheetName}: Missing required columns")
            continue
 
         # Convert Date to datetime
         df["Date"] = pd.to_datetime(df["metadata_info.timestamp"])
 
-        # Calculate stability metrics
+        # Calculate stability metrics along the target column in the dataset
         metrics = calculate_stability_metrics(
-            df, "avg_inference_latency(ms)", trimmed_col, tps_col
+            df,
+            target_metric,
         )
 
         primary_datasets[sheetName] = {
@@ -161,21 +154,8 @@ def analyze_latency_stability( # noqa: C901
                 config
             )
 
-            # Check if required columns exist
-            required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-            if "trimmean_inference_latency(ms)" in df.columns:
-                trimmed_col = "trimmean_inference_latency(ms)"
-                required_cols.append(trimmed_col)
-            else:
-                trimmed_col = None
-
-            if "TPS" in df.columns:
-                tps_col = "TPS"
-                required_cols.append(tps_col)
-            else:
-                tps_col = None
-
             # Skip sheets without required columns
+            required_cols = [target_metric, "metadata_info.timestamp"]
             if not all(col in df.columns for col in required_cols):
                 print(
                     f" Skipping reference {sheetName}: Missing required columns{required_cols}"
@@ -187,7 +167,8 @@ def analyze_latency_stability( # noqa: C901
 
             # Calculate stability metrics
             metrics = calculate_stability_metrics(
-                df, "avg_inference_latency(ms)", trimmed_col, tps_col
+                df,
+                target_metric,
             )
 
             reference_datasets[sheetName] = {
@@ -201,30 +182,33 @@ def analyze_latency_stability( # noqa: C901
             }
 
     # Process primary datasets
-    print_section_header("ANALYZING PRIMARY DATASETS")
-    for sheet, info in primary_datasets.items():
-        # Generate dataset report
-        generate_dataset_report(
-            sheet,
-            info["model"],
-            info["full_device"],
-            "Primary",
-            info["df"],
-            info["metrics"],
-            output_dir,
-        )
+    if verbose_level > 2:
+        print_section_header("ANALYZING PRIMARY DATASETS")
+        for sheet, info in primary_datasets.items():
+            # Generate dataset report
+            generate_dataset_report(
+                sheet,
+                target_metric,
+                info["model"],
+                info["full_device"],
+                "Primary",
+                info["df"],
+                info["metrics"],
+                output_dir,
+            )
 
-        # Generate time series plot
-        if len(info["df"]) > 5: # Only create plot if enough data points
-            generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
+            # Generate time series plot
+            if len(info["df"]) > 5: # Only create plot if enough data points
+                generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
 
     # Process reference datasets if provided
-    if reference_file:
+    if reference_file and verbose_level > 3:
         print_section_header("ANALYZING REFERENCE DATASETS")
         for sheet, info in reference_datasets.items():
             # Generate dataset report
             generate_dataset_report(
                 sheet,
+                target_metric,
                 info["model"],
                 info["full_device"],
                 "Reference",
@@ -238,7 +222,7 @@ def analyze_latency_stability( # noqa: C901
                 generate_time_series_plot(sheet, info["df"], output_dir, "Reference")
 
     # Generate comparison reports for matching datasets
-    if reference_file:
+    if reference_file and verbose_level > 1:
         print_section_header("PRIVATE VS PUBLIC STABILITY COMPARISON")
         matches_found = False
 
@@ -270,9 +254,10 @@ def analyze_latency_stability( # noqa: C901
         if not matches_found:
             print("No matching datasets found between primary and reference files.")
 
-    # Generate intra-primary summary (comparing across different models/devices)
-    print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
-    generate_intra_primary_summary(primary_datasets, output_dir)
+    if verbose_level > 0:
+        # Generate intra-primary summary (comparing across different models/devices)
+        print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
+        generate_intra_primary_summary(primary_datasets, output_dir)
 
     # Generate summary report for all datasets
     print_section_header("COMPREHENSIVE STABILITY SUMMARY")
@@ -285,28 +270,17 @@ def analyze_latency_stability( # noqa: C901
 
 
 def calculate_stability_metrics( # noqa: C901
-    df, raw_col, trimmed_col=None, tps_col=None
+    df,
+    target_metric,
 ):
     """Calculate stability metrics for the given dataset"""
     metrics = {}
-
-    # Extract data
-    raw_latency = df[raw_col].values
-    if trimmed_col and trimmed_col in df.columns:
-        trimmed_latency = df[trimmed_col].values
-    else:
-        trimmed_latency = None
-    if tps_col and tps_col in df.columns:
-        tps = df[tps_col].values
-    else:
-        tps = None
+    # Extract data and ingore NaN values
+    raw_latency = df[target_metric].dropna().values
 
     # Central tendency metrics
     metrics["mean_raw_latency"] = np.mean(raw_latency)
     metrics["median_raw_latency"] = np.median(raw_latency)
-    if trimmed_latency is not None:
-        metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
-        metrics["median_trimmed_latency"] = np.median(trimmed_latency)
 
     # Dispersion metrics
     metrics["std_raw_latency"] = np.std(raw_latency, ddof=1)
@@ -316,20 +290,10 @@ def calculate_stability_metrics( # noqa: C901
     metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile(
         raw_latency, 25
     )
-    if trimmed_latency is not None:
-        metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
-        metrics["cv_trimmed_latency"] = (
-            metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
-        ) * 100
-        metrics["iqr_trimmed_latency"] = np.percentile(
-            trimmed_latency, 75
-        ) - np.percentile(trimmed_latency, 25)
 
     # Percentile metrics
     for p in [50, 90, 95, 99]:
         metrics[f"p{p}_raw_latency"] = np.percentile(raw_latency, p)
-        if trimmed_latency is not None:
-            metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
 
     # Inter-jitter metrics (variability between runs)
     if np.min(raw_latency) > 0:
@@ -342,37 +306,45 @@ def calculate_stability_metrics( # noqa: C901
         metrics["p99_raw_latency"] / metrics["p50_raw_latency"]
     )
 
-    if trimmed_latency is not None:
-        if np.min(trimmed_latency) > 0:
-            metrics["max_min_range_ratio_trimmed"] = np.max(trimmed_latency) / np.min(
-                trimmed_latency
-            )
-        else:
-            metrics["max_min_range_ratio_trimmed"] = float("inf")
-            print(
-                "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+    # Intra-jitter proxy (if both raw and trimmed latency are available)
+    trimmed_metric_col = "trimmean_inference_latency(ms)"
+    if (
+        target_metric == "avg_inference_latency(ms)"
+        and trimmed_metric_col in df.columns
+    ):
+        trimmed_latency = df[trimmed_metric_col].values
+        if trimmed_latency is not None:
+            metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
+            metrics["median_trimmed_latency"] = np.median(trimmed_latency)
+            metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
+            metrics["cv_trimmed_latency"] = (
+                metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
+            ) * 100
+            metrics["iqr_trimmed_latency"] = np.percentile(
+                trimmed_latency, 75
+            ) - np.percentile(trimmed_latency, 25)
+            for p in [50, 90, 95, 99]:
+                metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
+            if np.min(trimmed_latency) > 0:
+                metrics["max_min_range_ratio_trimmed"] = np.max(
+                    trimmed_latency
+                ) / np.min(trimmed_latency)
+            else:
+                metrics["max_min_range_ratio_trimmed"] = float("inf")
+                print(
+                    "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+                )
+            metrics["p99_p50_ratio_trimmed"] = (
+                metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
             )
-
-        metrics["p99_p50_ratio_trimmed"] = (
-            metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
-        )
-
-    # Intra-jitter proxy (if both raw and trimmed are available)
-    if trimmed_latency is not None:
-        trimming_effect = (raw_latency - trimmed_latency) / raw_latency
-        metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
-        metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
-
-    # TPS metrics
-    if tps is not None:
-        metrics["mean_tps"] = np.mean(tps)
-        metrics["std_tps"] = np.std(tps, ddof=1)
-        metrics["cv_tps"] = (metrics["std_tps"] / metrics["mean_tps"]) * 100
+            trimming_effect = (raw_latency - trimmed_latency) / raw_latency
+            metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
+            metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
 
     # Time-based stability (rolling window of 5 samples)
     if len(df) >= 5:
         df_sorted = df.sort_values("Date")
-        rolling_std = df_sorted[raw_col].rolling(window=5).std()
+        rolling_std = df_sorted[target_metric].rolling(window=5).std()
         metrics["mean_rolling_std"] = rolling_std.mean()
         metrics["max_rolling_std"] = rolling_std.max()
 
@@ -419,7 +391,7 @@ def calculate_stability_metrics( # noqa: C901
 
 
 def generate_dataset_report( # noqa: C901
-    sheet_name, model, device, dataset_type, df, metrics, output_dir
+    sheet_name, target_column, model, device, dataset_type, df, metrics, output_dir
 ):
     """Generate a detailed report for a single dataset"""
     report_file = f"{output_dir}/{sheet_name}_{dataset_type.lower()}_report.txt"
@@ -436,7 +408,9 @@ def generate_dataset_report( # noqa: C901
 
     # Dataset overview
     report_content.append("Dataset Overview:")
-    report_content.append(f" - Number of samples: {len(df)}")
+    report_content.append(
+        f" - Number of samples: {len(df[target_column].dropna().values)}"
+    )
     report_content.append(f" - Date range: {df['Date'].min()} to {df['Date'].max()}")
     report_content.append("")
 
@@ -719,12 +693,12 @@ def generate_comparison_report( # noqa: C901
 
     # Add key metrics to the table
     metrics_to_compare = [
-        ("Mean Latency (ms)", "mean_raw_latency", "ms"),
-        ("Median Latency (ms)", "median_raw_latency", "ms"),
-        ("Standard Deviation (ms)", "std_raw_latency", "ms"),
+        ("Mean Value", "mean_raw_latency", ""),
+        ("Median Value", "median_raw_latency", ""),
+        ("Standard Deviation", "std_raw_latency", ""),
         ("CV (%)", "cv_raw_latency", "%"),
-        ("IQR (ms)", "iqr_raw_latency", "ms"),
-        ("P99 (ms)", "p99_raw_latency", "ms"),
+        ("IQR", "iqr_raw_latency", ""),
+        ("P99", "p99_raw_latency", ""),
         ("Max/Min Ratio", "max_min_range_ratio_raw", ""),
         ("P99/P50 Ratio", "p99_p50_ratio_raw", ""),
         ("Stability Score", "stability_score", ""),
@@ -1056,7 +1030,7 @@ def generate_intra_primary_summary(primary_datasets, output_dir): # noqa: C901
                 "Sheet": sheet_name,
                 "Model": info["model"],
                 "Device": info["full_device"],
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1293,7 +1267,7 @@ def generate_summary_report( # noqa: C901
                 "Dataset": sheet_name,
                 "Model": model,
                 "Device": device_display,
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1330,7 +1304,7 @@ def generate_summary_report( # noqa: C901
                 "Dataset": sheet_name,
                 "Model": model,
                 "Device": device_display,
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1541,17 +1515,34 @@ def main():
         help="Path to Excel file containing reference (public) benchmark data for comparison",
         default=None,
     )
+    parser.add_argument(
+        "--metric",
+        help="Target metric to analyze (default: avg_inference_latency(ms)). Examples: avg_inference_latency(ms), token_per_sec",
+        default="avg_inference_latency(ms)",
+    )
     parser.add_argument(
         "--output-dir",
         default="stability_analysis_results",
         help="Directory to save analysis results (default: stability_analysis_results)",
     )
-
+    parser.add_argument(
+        "--verbose-level",
+        type=int,
+        default=0,
+        choices=range(4),
+        help="Verbose level 0-3 (default: 0) to control analysis output detail. Higher values show more detailed results.",
+    )
     # Parse arguments
    args = parser.parse_args()
 
     # Run analysis
-    analyze_latency_stability(args.primary_file, args.reference_file, args.output_dir)
+    analyze_latency_stability(
+        args.metric,
+        args.primary_file,
+        args.reference_file,
+        args.output_dir,
+        args.verbose_level,
+    )
 
 
 if __name__ == "__main__":
```
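With the new positional signature, the analysis can also be driven programmatically. A sketch mirroring the wiring in `main()`; the import path is an assumption and depends on how the script is made importable:

```python
# Sketch only: assumes analyze_benchmark_stability.py is on the import path as a module.
from analyze_benchmark_stability import analyze_latency_stability

analyze_latency_stability(
    "token_per_sec",                          # target metric; any column present in the sheets
    "private.xlsx",                           # primary (private) benchmark export
    reference_file="public.xlsx",             # optional public export for comparison
    output_dir="stability_analysis_results",  # where per-dataset reports and plots are written
    verbose_level=0,                          # 0 = comprehensive summary only; higher levels add more sections
)
```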
