@@ -66,11 +66,16 @@ def is_matching_dataset(primary_sheet, reference_sheet):
 
 
 def analyze_latency_stability(  # noqa: C901
-    primary_file, reference_file=None, output_dir="stability_analysis_results"
+    target_metric,
+    primary_file,
+    reference_file=None,
+    output_dir="stability_analysis_results",
+    verbose_level=0,
 ):
-    print(f"Analyzing latency stability from primary file: {primary_file}")
+    print_section_header(f"Analyzing Stability Against Metric '{target_metric}'")
+    print(f"Primary dataset: {primary_file}")
     if reference_file:
-        print(f"Using reference file for comparison: {reference_file}")
+        print(f"Reference dataset for comparison: {reference_file}")
 
     # Create output directory if it doesn't exist
     if not os.path.exists(output_dir):
@@ -99,31 +104,19 @@ def analyze_latency_stability( # noqa: C901
 
         model, full_device, base_device, os_version = parse_model_device_config(config)
 
-        # Check if required columns exist
-        required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-        if "trimmean_inference_latency(ms)" in df.columns:
-            trimmed_col = "trimmean_inference_latency(ms)"
-            required_cols.append(trimmed_col)
-        else:
-            trimmed_col = None
-
-        if "TPS" in df.columns:
-            tps_col = "TPS"
-            required_cols.append(tps_col)
-        else:
-            tps_col = None
-
         # Skip sheets without required columns
+        required_cols = [target_metric, "metadata_info.timestamp"]
         if not all(col in df.columns for col in required_cols):
             print(f" Skipping {sheetName}: Missing required columns")
             continue
 
         # Convert Date to datetime
         df["Date"] = pd.to_datetime(df["metadata_info.timestamp"])
 
-        # Calculate stability metrics
+        # Calculate stability metrics along the target column in the dataset
         metrics = calculate_stability_metrics(
-            df, "avg_inference_latency(ms)", trimmed_col, tps_col
+            df,
+            target_metric,
         )
 
         primary_datasets[sheetName] = {
@@ -161,21 +154,8 @@ def analyze_latency_stability( # noqa: C901
             config
         )
 
-        # Check if required columns exist
-        required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-        if "trimmean_inference_latency(ms)" in df.columns:
-            trimmed_col = "trimmean_inference_latency(ms)"
-            required_cols.append(trimmed_col)
-        else:
-            trimmed_col = None
-
-        if "TPS" in df.columns:
-            tps_col = "TPS"
-            required_cols.append(tps_col)
-        else:
-            tps_col = None
-
         # Skip sheets without required columns
+        required_cols = [target_metric, "metadata_info.timestamp"]
         if not all(col in df.columns for col in required_cols):
             print(
                 f" Skipping reference {sheetName}: Missing required columns{required_cols}"
@@ -187,7 +167,8 @@ def analyze_latency_stability( # noqa: C901
 
         # Calculate stability metrics
         metrics = calculate_stability_metrics(
-            df, "avg_inference_latency(ms)", trimmed_col, tps_col
+            df,
+            target_metric,
         )
 
         reference_datasets[sheetName] = {
@@ -201,30 +182,33 @@ def analyze_latency_stability( # noqa: C901
         }
 
     # Process primary datasets
-    print_section_header("ANALYZING PRIMARY DATASETS")
-    for sheet, info in primary_datasets.items():
-        # Generate dataset report
-        generate_dataset_report(
-            sheet,
-            info["model"],
-            info["full_device"],
-            "Primary",
-            info["df"],
-            info["metrics"],
-            output_dir,
-        )
+    if verbose_level > 2:
+        print_section_header("ANALYZING PRIMARY DATASETS")
+        for sheet, info in primary_datasets.items():
+            # Generate dataset report
+            generate_dataset_report(
+                sheet,
+                target_metric,
+                info["model"],
+                info["full_device"],
+                "Primary",
+                info["df"],
+                info["metrics"],
+                output_dir,
+            )
 
-        # Generate time series plot
-        if len(info["df"]) > 5:  # Only create plot if enough data points
-            generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
+            # Generate time series plot
+            if len(info["df"]) > 5:  # Only create plot if enough data points
+                generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
 
     # Process reference datasets if provided
-    if reference_file:
+    if reference_file and verbose_level > 3:
         print_section_header("ANALYZING REFERENCE DATASETS")
         for sheet, info in reference_datasets.items():
             # Generate dataset report
             generate_dataset_report(
                 sheet,
+                target_metric,
                 info["model"],
                 info["full_device"],
                 "Reference",
@@ -238,7 +222,7 @@ def analyze_latency_stability( # noqa: C901
                 generate_time_series_plot(sheet, info["df"], output_dir, "Reference")
 
     # Generate comparison reports for matching datasets
-    if reference_file:
+    if reference_file and verbose_level > 1:
         print_section_header("PRIVATE VS PUBLIC STABILITY COMPARISON")
         matches_found = False
 
@@ -270,9 +254,10 @@ def analyze_latency_stability( # noqa: C901
         if not matches_found:
             print("No matching datasets found between primary and reference files.")
 
-    # Generate intra-primary summary (comparing across different models/devices)
-    print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
-    generate_intra_primary_summary(primary_datasets, output_dir)
+    if verbose_level > 0:
+        # Generate intra-primary summary (comparing across different models/devices)
+        print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
+        generate_intra_primary_summary(primary_datasets, output_dir)
 
     # Generate summary report for all datasets
     print_section_header("COMPREHENSIVE STABILITY SUMMARY")
@@ -285,28 +270,17 @@ def analyze_latency_stability( # noqa: C901
 
 
 def calculate_stability_metrics(  # noqa: C901
-    df, raw_col, trimmed_col=None, tps_col=None
+    df,
+    target_metric,
 ):
     """Calculate stability metrics for the given dataset"""
     metrics = {}
-
-    # Extract data
-    raw_latency = df[raw_col].values
-    if trimmed_col and trimmed_col in df.columns:
-        trimmed_latency = df[trimmed_col].values
-    else:
-        trimmed_latency = None
-    if tps_col and tps_col in df.columns:
-        tps = df[tps_col].values
-    else:
-        tps = None
+    # Extract data and ignore NaN values
+    raw_latency = df[target_metric].dropna().values
 
     # Central tendency metrics
     metrics["mean_raw_latency"] = np.mean(raw_latency)
     metrics["median_raw_latency"] = np.median(raw_latency)
-    if trimmed_latency is not None:
-        metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
-        metrics["median_trimmed_latency"] = np.median(trimmed_latency)
 
     # Dispersion metrics
     metrics["std_raw_latency"] = np.std(raw_latency, ddof=1)
@@ -316,20 +290,10 @@ def calculate_stability_metrics( # noqa: C901
     metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile(
         raw_latency, 25
     )
-    if trimmed_latency is not None:
-        metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
-        metrics["cv_trimmed_latency"] = (
-            metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
-        ) * 100
-        metrics["iqr_trimmed_latency"] = np.percentile(
-            trimmed_latency, 75
-        ) - np.percentile(trimmed_latency, 25)
 
     # Percentile metrics
     for p in [50, 90, 95, 99]:
         metrics[f"p{p}_raw_latency"] = np.percentile(raw_latency, p)
-        if trimmed_latency is not None:
-            metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
 
     # Inter-jitter metrics (variability between runs)
     if np.min(raw_latency) > 0:
@@ -342,37 +306,45 @@ def calculate_stability_metrics( # noqa: C901
         metrics["p99_raw_latency"] / metrics["p50_raw_latency"]
     )
 
-    if trimmed_latency is not None:
-        if np.min(trimmed_latency) > 0:
-            metrics["max_min_range_ratio_trimmed"] = np.max(trimmed_latency) / np.min(
-                trimmed_latency
-            )
-        else:
-            metrics["max_min_range_ratio_trimmed"] = float("inf")
-            print(
-                "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+    # Intra-jitter proxy (if both raw and trimmed latency are available)
+    trimmed_metric_col = "trimmean_inference_latency(ms)"
+    if (
+        target_metric == "avg_inference_latency(ms)"
+        and trimmed_metric_col in df.columns
+    ):
+        trimmed_latency = df[trimmed_metric_col].values
+        if trimmed_latency is not None:
+            metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
+            metrics["median_trimmed_latency"] = np.median(trimmed_latency)
+            metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
+            metrics["cv_trimmed_latency"] = (
+                metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
+            ) * 100
+            metrics["iqr_trimmed_latency"] = np.percentile(
+                trimmed_latency, 75
+            ) - np.percentile(trimmed_latency, 25)
+            for p in [50, 90, 95, 99]:
+                metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
+            if np.min(trimmed_latency) > 0:
+                metrics["max_min_range_ratio_trimmed"] = np.max(
+                    trimmed_latency
+                ) / np.min(trimmed_latency)
+            else:
+                metrics["max_min_range_ratio_trimmed"] = float("inf")
+                print(
+                    "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+                )
+            metrics["p99_p50_ratio_trimmed"] = (
+                metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
             )
-
-        metrics["p99_p50_ratio_trimmed"] = (
-            metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
-        )
-
-    # Intra-jitter proxy (if both raw and trimmed are available)
-    if trimmed_latency is not None:
-        trimming_effect = (raw_latency - trimmed_latency) / raw_latency
-        metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
-        metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
-
-    # TPS metrics
-    if tps is not None:
-        metrics["mean_tps"] = np.mean(tps)
-        metrics["std_tps"] = np.std(tps, ddof=1)
-        metrics["cv_tps"] = (metrics["std_tps"] / metrics["mean_tps"]) * 100
+            trimming_effect = (raw_latency - trimmed_latency) / raw_latency
+            metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
+            metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
 
     # Time-based stability (rolling window of 5 samples)
     if len(df) >= 5:
         df_sorted = df.sort_values("Date")
-        rolling_std = df_sorted[raw_col].rolling(window=5).std()
+        rolling_std = df_sorted[target_metric].rolling(window=5).std()
         metrics["mean_rolling_std"] = rolling_std.mean()
         metrics["max_rolling_std"] = rolling_std.max()
 
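
Aside on the dispersion metrics in the hunk above (illustrative, not part of the diff): for a metric series x, cv_raw_latency is 100 * std(x, ddof=1) / mean(x), iqr_raw_latency is P75 - P25, and the trimming-effect ratio is (raw - trimmed) / raw per sample. A minimal standalone sketch of the raw-metric computations, using made-up numbers:

    import numpy as np

    # Synthetic latency samples (ms) -- illustrative only
    latency = np.array([12.1, 11.8, 12.4, 13.0, 11.9])

    mean = np.mean(latency)
    std = np.std(latency, ddof=1)  # sample standard deviation, as in the diff
    cv_pct = std / mean * 100      # -> cv_raw_latency
    iqr = np.percentile(latency, 75) - np.percentile(latency, 25)      # -> iqr_raw_latency
    p99_p50 = np.percentile(latency, 99) / np.percentile(latency, 50)  # -> p99_p50_ratio_raw

    print(f"CV={cv_pct:.2f}%  IQR={iqr:.2f}  P99/P50={p99_p50:.3f}")
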
@@ -419,7 +391,7 @@ def calculate_stability_metrics( # noqa: C901
 
 
 def generate_dataset_report(  # noqa: C901
-    sheet_name, model, device, dataset_type, df, metrics, output_dir
+    sheet_name, target_column, model, device, dataset_type, df, metrics, output_dir
 ):
     """Generate a detailed report for a single dataset"""
     report_file = f"{output_dir}/{sheet_name}_{dataset_type.lower()}_report.txt"
@@ -436,7 +408,9 @@ def generate_dataset_report( # noqa: C901
 
     # Dataset overview
     report_content.append("Dataset Overview:")
-    report_content.append(f" - Number of samples: {len(df)}")
+    report_content.append(
+        f" - Number of samples: {len(df[target_column].dropna().values)}"
+    )
     report_content.append(f" - Date range: {df['Date'].min()} to {df['Date'].max()}")
     report_content.append("")
 
@@ -719,12 +693,12 @@ def generate_comparison_report( # noqa: C901
 
     # Add key metrics to the table
     metrics_to_compare = [
-        ("Mean Latency (ms)", "mean_raw_latency", "ms"),
-        ("Median Latency (ms)", "median_raw_latency", "ms"),
-        ("Standard Deviation (ms)", "std_raw_latency", "ms"),
+        ("Mean Value", "mean_raw_latency", ""),
+        ("Median Value", "median_raw_latency", ""),
+        ("Standard Deviation", "std_raw_latency", ""),
         ("CV (%)", "cv_raw_latency", "%"),
-        ("IQR (ms)", "iqr_raw_latency", "ms"),
-        ("P99 (ms)", "p99_raw_latency", "ms"),
+        ("IQR", "iqr_raw_latency", ""),
+        ("P99", "p99_raw_latency", ""),
         ("Max/Min Ratio", "max_min_range_ratio_raw", ""),
         ("P99/P50 Ratio", "p99_p50_ratio_raw", ""),
         ("Stability Score", "stability_score", ""),
@@ -1056,7 +1030,7 @@ def generate_intra_primary_summary(primary_datasets, output_dir): # noqa: C901
                 "Sheet": sheet_name,
                 "Model": info["model"],
                 "Device": info["full_device"],
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1293,7 +1267,7 @@ def generate_summary_report( # noqa: C901
                 "Dataset": sheet_name,
                 "Model": model,
                 "Device": device_display,
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1330,7 +1304,7 @@ def generate_summary_report( # noqa: C901
                 "Dataset": sheet_name,
                 "Model": model,
                 "Device": device_display,
-                "Mean Latency (ms)": info["metrics"]["mean_raw_latency"],
+                "Mean Value": info["metrics"]["mean_raw_latency"],
                 "CV (%)": info["metrics"]["cv_raw_latency"],
                 "Stability Score": info["metrics"]["stability_score"],
                 "Stability Rating": info["metrics"]["stability_rating"],
@@ -1541,17 +1515,34 @@ def main():
         help="Path to Excel file containing reference (public) benchmark data for comparison",
         default=None,
     )
+    parser.add_argument(
+        "--metric",
+        help="Target metric to analyze (default: avg_inference_latency(ms)). Examples: avg_inference_latency(ms), token_per_sec",
+        default="avg_inference_latency(ms)",
+    )
     parser.add_argument(
         "--output-dir",
         default="stability_analysis_results",
         help="Directory to save analysis results (default: stability_analysis_results)",
     )
-
+    parser.add_argument(
+        "--verbose-level",
+        type=int,
+        default=0,
+        choices=range(4),
+        help="Verbose level 0-3 (default: 0) to control analysis output detail. Higher values show more detailed results.",
+    )
     # Parse arguments
     args = parser.parse_args()
 
     # Run analysis
-    analyze_latency_stability(args.primary_file, args.reference_file, args.output_dir)
+    analyze_latency_stability(
+        args.metric,
+        args.primary_file,
+        args.reference_file,
+        args.output_dir,
+        args.verbose_level,
+    )
 
 
 if __name__ == "__main__":
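
Usage sketch for the reworked entry point (illustrative, not part of the diff; the module name below is an assumption, the rest follows the new signature and flags shown above):

    # Programmatic call with the new signature: target_metric now comes first,
    # and verbose_level (0-3) gates how much of the analysis output is produced.
    from analyze_stability import analyze_latency_stability  # module name assumed

    analyze_latency_stability(
        "avg_inference_latency(ms)",   # target_metric: column analyzed in each sheet
        "private_benchmarks.xlsx",     # primary_file: primary (private) workbook
        reference_file="public_benchmarks.xlsx",  # optional reference workbook
        output_dir="stability_analysis_results",
        verbose_level=2,               # >0 intra-primary summary, >1 private-vs-public
                                       # comparison, >2 per-dataset primary reports
    )

    # The same knobs are exposed on the command line through the new --metric and
    # --verbose-level flags, alongside the existing --output-dir.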