@@ -66,11 +66,16 @@ def is_matching_dataset(primary_sheet, reference_sheet):
 
 
 def analyze_latency_stability(  # noqa: C901
-    primary_file, reference_file=None, output_dir="stability_analysis_results"
+    target_metric,
+    primary_file,
+    reference_file=None,
+    output_dir="stability_analysis_results",
+    verbose_level=0,
 ):
-    print(f"Analyzing latency stability from primary file: {primary_file}")
+    print_section_header(f"Analyzing Stability Against Metric '{target_metric}'")
+    print(f"Primary dataset: {primary_file}")
 
     if reference_file:
-        print(f"Using reference file for comparison: {reference_file}")
+        print(f"Reference dataset for comparison: {reference_file}")
 
     # Create output directory if it doesn't exist
     if not os.path.exists(output_dir):
@@ -99,31 +104,19 @@ def analyze_latency_stability( # noqa: C901
 
         model, full_device, base_device, os_version = parse_model_device_config(config)
 
-        # Check if required columns exist
-        required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-        if "trimmean_inference_latency(ms)" in df.columns:
-            trimmed_col = "trimmean_inference_latency(ms)"
-            required_cols.append(trimmed_col)
-        else:
-            trimmed_col = None
-
-        if "TPS" in df.columns:
-            tps_col = "TPS"
-            required_cols.append(tps_col)
-        else:
-            tps_col = None
-
         # Skip sheets without required columns
+        required_cols = [target_metric, "metadata_info.timestamp"]
         if not all(col in df.columns for col in required_cols):
             print(f"  Skipping {sheetName}: Missing required columns")
             continue
 
         # Convert Date to datetime
         df["Date"] = pd.to_datetime(df["metadata_info.timestamp"])
 
-        # Calculate stability metrics
+        # Calculate stability metrics along the target column in the dataset
         metrics = calculate_stability_metrics(
-            df, "avg_inference_latency(ms)", trimmed_col, tps_col
+            df,
+            target_metric,
         )
 
         primary_datasets[sheetName] = {
@@ -161,21 +154,8 @@ def analyze_latency_stability( # noqa: C901
                 config
             )
 
-            # Check if required columns exist
-            required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-            if "trimmean_inference_latency(ms)" in df.columns:
-                trimmed_col = "trimmean_inference_latency(ms)"
-                required_cols.append(trimmed_col)
-            else:
-                trimmed_col = None
-
-            if "TPS" in df.columns:
-                tps_col = "TPS"
-                required_cols.append(tps_col)
-            else:
-                tps_col = None
-
             # Skip sheets without required columns
+            required_cols = [target_metric, "metadata_info.timestamp"]
             if not all(col in df.columns for col in required_cols):
                 print(
                     f"  Skipping reference {sheetName}: Missing required columns{required_cols}"
@@ -187,7 +167,8 @@ def analyze_latency_stability( # noqa: C901
 
             # Calculate stability metrics
             metrics = calculate_stability_metrics(
-                df, "avg_inference_latency(ms)", trimmed_col, tps_col
+                df,
+                target_metric,
             )
 
             reference_datasets[sheetName] = {
@@ -201,30 +182,33 @@ def analyze_latency_stability( # noqa: C901
             }
 
     # Process primary datasets
-    print_section_header("ANALYZING PRIMARY DATASETS")
-    for sheet, info in primary_datasets.items():
-        # Generate dataset report
-        generate_dataset_report(
-            sheet,
-            info["model"],
-            info["full_device"],
-            "Primary",
-            info["df"],
-            info["metrics"],
-            output_dir,
-        )
+    if verbose_level > 2:
+        print_section_header("ANALYZING PRIMARY DATASETS")
+        for sheet, info in primary_datasets.items():
+            # Generate dataset report
+            generate_dataset_report(
+                sheet,
+                target_metric,
+                info["model"],
+                info["full_device"],
+                "Primary",
+                info["df"],
+                info["metrics"],
+                output_dir,
+            )
 
-        # Generate time series plot
-        if len(info["df"]) > 5:  # Only create plot if enough data points
-            generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
+            # Generate time series plot
+            if len(info["df"]) > 5:  # Only create plot if enough data points
+                generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
 
     # Process reference datasets if provided
-    if reference_file:
+    if reference_file and verbose_level > 3:
         print_section_header("ANALYZING REFERENCE DATASETS")
         for sheet, info in reference_datasets.items():
             # Generate dataset report
             generate_dataset_report(
                 sheet,
+                target_metric,
                 info["model"],
                 info["full_device"],
                 "Reference",
@@ -238,7 +222,7 @@ def analyze_latency_stability( # noqa: C901
                 generate_time_series_plot(sheet, info["df"], output_dir, "Reference")
 
     # Generate comparison reports for matching datasets
-    if reference_file:
+    if reference_file and verbose_level > 1:
         print_section_header("PRIVATE VS PUBLIC STABILITY COMPARISON")
         matches_found = False
 
@@ -270,9 +254,10 @@ def analyze_latency_stability( # noqa: C901
         if not matches_found:
             print("No matching datasets found between primary and reference files.")
 
-    # Generate intra-primary summary (comparing across different models/devices)
-    print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
-    generate_intra_primary_summary(primary_datasets, output_dir)
+    if verbose_level > 0:
+        # Generate intra-primary summary (comparing across different models/devices)
+        print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
+        generate_intra_primary_summary(primary_datasets, output_dir)
 
     # Generate summary report for all datasets
     print_section_header("COMPREHENSIVE STABILITY SUMMARY")
@@ -285,28 +270,17 @@ def analyze_latency_stability( # noqa: C901
 
 
 def calculate_stability_metrics(  # noqa: C901
-    df, raw_col, trimmed_col=None, tps_col=None
+    df,
+    target_metric,
 ):
     """Calculate stability metrics for the given dataset"""
     metrics = {}
-
-    # Extract data
-    raw_latency = df[raw_col].values
-    if trimmed_col and trimmed_col in df.columns:
-        trimmed_latency = df[trimmed_col].values
-    else:
-        trimmed_latency = None
-    if tps_col and tps_col in df.columns:
-        tps = df[tps_col].values
-    else:
-        tps = None
+    # Extract data and ignore NaN values
+    raw_latency = df[target_metric].dropna().values
 
     # Central tendency metrics
     metrics["mean_raw_latency"] = np.mean(raw_latency)
     metrics["median_raw_latency"] = np.median(raw_latency)
-    if trimmed_latency is not None:
-        metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
-        metrics["median_trimmed_latency"] = np.median(trimmed_latency)
 
     # Dispersion metrics
     metrics["std_raw_latency"] = np.std(raw_latency, ddof=1)
@@ -316,20 +290,10 @@ def calculate_stability_metrics( # noqa: C901
     metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile(
         raw_latency, 25
     )
-    if trimmed_latency is not None:
-        metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
-        metrics["cv_trimmed_latency"] = (
-            metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
-        ) * 100
-        metrics["iqr_trimmed_latency"] = np.percentile(
-            trimmed_latency, 75
-        ) - np.percentile(trimmed_latency, 25)
 
     # Percentile metrics
     for p in [50, 90, 95, 99]:
         metrics[f"p{p}_raw_latency"] = np.percentile(raw_latency, p)
-        if trimmed_latency is not None:
-            metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
 
     # Inter-jitter metrics (variability between runs)
     if np.min(raw_latency) > 0:
@@ -342,37 +306,45 @@ def calculate_stability_metrics( # noqa: C901
         metrics["p99_raw_latency"] / metrics["p50_raw_latency"]
     )
 
-    if trimmed_latency is not None:
-        if np.min(trimmed_latency) > 0:
-            metrics["max_min_range_ratio_trimmed"] = np.max(trimmed_latency) / np.min(
-                trimmed_latency
-            )
-        else:
-            metrics["max_min_range_ratio_trimmed"] = float("inf")
-            print(
-                "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+    # Intra-jitter proxy (if both raw and trimmed latency are available)
+    trimmed_metric_col = "trimmean_inference_latency(ms)"
+    if (
+        target_metric == "avg_inference_latency(ms)"
+        and trimmed_metric_col in df.columns
+    ):
+        trimmed_latency = df[trimmed_metric_col].values
+        if trimmed_latency is not None:
+            metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
+            metrics["median_trimmed_latency"] = np.median(trimmed_latency)
+            metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
+            metrics["cv_trimmed_latency"] = (
+                metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
+            ) * 100
+            metrics["iqr_trimmed_latency"] = np.percentile(
+                trimmed_latency, 75
+            ) - np.percentile(trimmed_latency, 25)
+            for p in [50, 90, 95, 99]:
+                metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
+            if np.min(trimmed_latency) > 0:
+                metrics["max_min_range_ratio_trimmed"] = np.max(
+                    trimmed_latency
+                ) / np.min(trimmed_latency)
+            else:
+                metrics["max_min_range_ratio_trimmed"] = float("inf")
+                print(
+                    "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+                )
+            metrics["p99_p50_ratio_trimmed"] = (
+                metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
             )
-
-        metrics["p99_p50_ratio_trimmed"] = (
-            metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
-        )
-
-    # Intra-jitter proxy (if both raw and trimmed are available)
-    if trimmed_latency is not None:
-        trimming_effect = (raw_latency - trimmed_latency) / raw_latency
-        metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
-        metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
-
-    # TPS metrics
-    if tps is not None:
-        metrics["mean_tps"] = np.mean(tps)
-        metrics["std_tps"] = np.std(tps, ddof=1)
-        metrics["cv_tps"] = (metrics["std_tps"] / metrics["mean_tps"]) * 100
+            trimming_effect = (raw_latency - trimmed_latency) / raw_latency
+            metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
+            metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
 
     # Time-based stability (rolling window of 5 samples)
     if len(df) >= 5:
         df_sorted = df.sort_values("Date")
-        rolling_std = df_sorted[raw_col].rolling(window=5).std()
+        rolling_std = df_sorted[target_metric].rolling(window=5).std()
         metrics["mean_rolling_std"] = rolling_std.mean()
         metrics["max_rolling_std"] = rolling_std.max()
 
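As a standalone illustration of the dispersion metrics this hunk carries over to the trimmed column (the same quantities already computed for the raw column), the core calculations reduce to a few NumPy calls. This sketch uses a made-up latency array and is not code from this commit:

import numpy as np

# Hypothetical latency samples in ms; stands in for df[target_metric].dropna().values.
raw_latency = np.array([12.1, 11.8, 12.4, 13.0, 11.9, 12.2, 14.5, 12.0])

metrics = {}
metrics["mean_raw_latency"] = np.mean(raw_latency)
metrics["std_raw_latency"] = np.std(raw_latency, ddof=1)  # sample standard deviation
metrics["cv_raw_latency"] = metrics["std_raw_latency"] / metrics["mean_raw_latency"] * 100
metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile(raw_latency, 25)
metrics["p99_p50_ratio_raw"] = np.percentile(raw_latency, 99) / np.percentile(raw_latency, 50)
print(metrics)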
@@ -419,7 +391,7 @@ def calculate_stability_metrics( # noqa: C901
 
 
 def generate_dataset_report(  # noqa: C901
-    sheet_name, model, device, dataset_type, df, metrics, output_dir
+    sheet_name, target_column, model, device, dataset_type, df, metrics, output_dir
 ):
     """Generate a detailed report for a single dataset"""
     report_file = f"{output_dir}/{sheet_name}_{dataset_type.lower()}_report.txt"
@@ -436,7 +408,9 @@ def generate_dataset_report( # noqa: C901
 
     # Dataset overview
     report_content.append("Dataset Overview:")
-    report_content.append(f"  - Number of samples: {len(df)}")
+    report_content.append(
+        f"  - Number of samples: {len(df[target_column].dropna().values)}"
+    )
     report_content.append(f"  - Date range: {df['Date'].min()} to {df['Date'].max()}")
     report_content.append("")
 
@@ -719,12 +693,12 @@ def generate_comparison_report( # noqa: C901
 
     # Add key metrics to the table
     metrics_to_compare = [
-        ("Mean Latency (ms)", "mean_raw_latency", "ms"),
-        ("Median Latency (ms)", "median_raw_latency", "ms"),
-        ("Standard Deviation (ms)", "std_raw_latency", "ms"),
+        ("Mean Value", "mean_raw_latency", ""),
+        ("Median Value", "median_raw_latency", ""),
+        ("Standard Deviation", "std_raw_latency", ""),
         ("CV (%)", "cv_raw_latency", "%"),
-        ("IQR (ms)", "iqr_raw_latency", "ms"),
-        ("P99 (ms)", "p99_raw_latency", "ms"),
+        ("IQR", "iqr_raw_latency", ""),
+        ("P99", "p99_raw_latency", ""),
         ("Max/Min Ratio", "max_min_range_ratio_raw", ""),
         ("P99/P50 Ratio", "p99_p50_ratio_raw", ""),
         ("Stability Score", "stability_score", ""),
@@ -1056,7 +1030,7 @@ def generate_intra_primary_summary(primary_datasets, output_dir): # noqa: C901
                 "Sheet": sheet_name,
                 "Model": info["model"],
                 "Device": info["full_device"],
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1293,7 +1267,7 @@ def generate_summary_report( # noqa: C901
                 "Dataset": sheet_name,
                 "Model": model,
                 "Device": device_display,
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1330,7 +1304,7 @@ def generate_summary_report( # noqa: C901
                 "Dataset": sheet_name,
                 "Model": model,
                 "Device": device_display,
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1541,17 +1515,34 @@ def main():
         help="Path to Excel file containing reference (public) benchmark data for comparison",
         default=None,
     )
+    parser.add_argument(
+        "--metric",
+        help="Target metric to analyze (default: avg_inference_latency(ms)). Examples: avg_inference_latency(ms), token_per_sec",
+        default="avg_inference_latency(ms)",
+    )
     parser.add_argument(
         "--output-dir",
         default="stability_analysis_results",
         help="Directory to save analysis results (default: stability_analysis_results)",
     )
-
+    parser.add_argument(
+        "--verbose-level",
+        type=int,
+        default=0,
+        choices=range(4),
+        help="Verbose level 0-3 (default: 0) to control analysis output detail. Higher values show more detailed results.",
+    )
 
     # Parse arguments
     args = parser.parse_args()
     # Run analysis
-    analyze_latency_stability(args.primary_file, args.reference_file, args.output_dir)
+    analyze_latency_stability(
+        args.metric,
+        args.primary_file,
+        args.reference_file,
+        args.output_dir,
+        args.verbose_level,
+    )
 
 
 if __name__ == "__main__":
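With these changes the analysis function takes the target metric as its first argument. A hedged sketch of a programmatic call mirroring the new main() wiring; the module name "analyze_stability" and the .xlsx path are assumptions, not taken from the diff:

# Hypothetical programmatic use of the updated signature.
from analyze_stability import analyze_latency_stability

analyze_latency_stability(
    "avg_inference_latency(ms)",   # target_metric, the new leading argument
    "private_benchmarks.xlsx",     # primary_file (hypothetical path)
    reference_file=None,
    output_dir="stability_analysis_results",
    verbose_level=1,               # > 0 also emits the intra-primary summary
)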