@@ -66,7 +66,11 @@ def is_matching_dataset(primary_sheet, reference_sheet):
 
 
 def analyze_latency_stability(  # noqa: C901
-    primary_file, reference_file=None, output_dir="stability_analysis_results"
+    target_metric,
+    primary_file,
+    reference_file=None,
+    output_dir="stability_analysis_results",
+    verbose_level=0,
 ):
     print(f"Analyzing latency stability from primary file: {primary_file}")
     if reference_file:
@@ -99,31 +103,19 @@ def analyze_latency_stability( # noqa: C901
 
         model, full_device, base_device, os_version = parse_model_device_config(config)
 
-        # Check if required columns exist
-        required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-        if "trimmean_inference_latency(ms)" in df.columns:
-            trimmed_col = "trimmean_inference_latency(ms)"
-            required_cols.append(trimmed_col)
-        else:
-            trimmed_col = None
-
-        if "TPS" in df.columns:
-            tps_col = "TPS"
-            required_cols.append(tps_col)
-        else:
-            tps_col = None
-
         # Skip sheets without required columns
+        required_cols = [target_metric, "metadata_info.timestamp"]
         if not all(col in df.columns for col in required_cols):
             print(f" Skipping {sheetName}: Missing required columns")
             continue
 
         # Convert Date to datetime
         df["Date"] = pd.to_datetime(df["metadata_info.timestamp"])
 
-        # Calculate stability metrics
+        # Calculate stability metrics along the target column in the dataset
         metrics = calculate_stability_metrics(
-            df, "avg_inference_latency(ms)", trimmed_col, tps_col
+            df,
+            target_metric,
         )
 
         primary_datasets[sheetName] = {
@@ -161,21 +153,8 @@ def analyze_latency_stability( # noqa: C901
                 config
             )
 
-            # Check if required columns exist
-            required_cols = ["avg_inference_latency(ms)", "metadata_info.timestamp"]
-            if "trimmean_inference_latency(ms)" in df.columns:
-                trimmed_col = "trimmean_inference_latency(ms)"
-                required_cols.append(trimmed_col)
-            else:
-                trimmed_col = None
-
-            if "TPS" in df.columns:
-                tps_col = "TPS"
-                required_cols.append(tps_col)
-            else:
-                tps_col = None
-
             # Skip sheets without required columns
+            required_cols = [target_metric, "metadata_info.timestamp"]
            if not all(col in df.columns for col in required_cols):
                print(
                    f" Skipping reference {sheetName}: Missing required columns{required_cols}"
@@ -187,7 +166,8 @@ def analyze_latency_stability( # noqa: C901
 
             # Calculate stability metrics
             metrics = calculate_stability_metrics(
-                df, "avg_inference_latency(ms)", trimmed_col, tps_col
+                df,
+                target_metric,
             )
 
             reference_datasets[sheetName] = {
@@ -201,30 +181,33 @@ def analyze_latency_stability( # noqa: C901
             }
 
     # Process primary datasets
-    print_section_header("ANALYZING PRIMARY DATASETS")
-    for sheet, info in primary_datasets.items():
-        # Generate dataset report
-        generate_dataset_report(
-            sheet,
-            info["model"],
-            info["full_device"],
-            "Primary",
-            info["df"],
-            info["metrics"],
-            output_dir,
-        )
+    if verbose_level > 2:
+        print_section_header("ANALYZING PRIMARY DATASETS")
+        for sheet, info in primary_datasets.items():
+            # Generate dataset report
+            generate_dataset_report(
+                sheet,
+                target_metric,
+                info["model"],
+                info["full_device"],
+                "Primary",
+                info["df"],
+                info["metrics"],
+                output_dir,
+            )
 
-        # Generate time series plot
-        if len(info["df"]) > 5:  # Only create plot if enough data points
-            generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
+            # Generate time series plot
+            if len(info["df"]) > 5:  # Only create plot if enough data points
+                generate_time_series_plot(sheet, info["df"], output_dir, "Primary")
 
     # Process reference datasets if provided
-    if reference_file:
+    if reference_file and verbose_level > 3:
         print_section_header("ANALYZING REFERENCE DATASETS")
         for sheet, info in reference_datasets.items():
             # Generate dataset report
             generate_dataset_report(
                 sheet,
+                target_metric,
                 info["model"],
                 info["full_device"],
                 "Reference",
@@ -238,7 +221,7 @@ def analyze_latency_stability( # noqa: C901
                 generate_time_series_plot(sheet, info["df"], output_dir, "Reference")
 
     # Generate comparison reports for matching datasets
-    if reference_file:
+    if reference_file and verbose_level > 1:
         print_section_header("PRIVATE VS PUBLIC STABILITY COMPARISON")
         matches_found = False
 
@@ -270,9 +253,10 @@ def analyze_latency_stability( # noqa: C901
         if not matches_found:
             print("No matching datasets found between primary and reference files.")
 
-    # Generate intra-primary summary (comparing across different models/devices)
-    print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
-    generate_intra_primary_summary(primary_datasets, output_dir)
+    if verbose_level > 0:
+        # Generate intra-primary summary (comparing across different models/devices)
+        print_section_header("INTRA-PRIMARY STABILITY COMPARISON")
+        generate_intra_primary_summary(primary_datasets, output_dir)
 
     # Generate summary report for all datasets
     print_section_header("COMPREHENSIVE STABILITY SUMMARY")
@@ -285,28 +269,17 @@ def analyze_latency_stability( # noqa: C901
 
 
 def calculate_stability_metrics(  # noqa: C901
-    df, raw_col, trimmed_col=None, tps_col=None
+    df,
+    target_metric,
 ):
     """Calculate stability metrics for the given dataset"""
     metrics = {}
-
-    # Extract data
-    raw_latency = df[raw_col].values
-    if trimmed_col and trimmed_col in df.columns:
-        trimmed_latency = df[trimmed_col].values
-    else:
-        trimmed_latency = None
-    if tps_col and tps_col in df.columns:
-        tps = df[tps_col].values
-    else:
-        tps = None
+    # Extract data and ignore NaN values
+    raw_latency = df[target_metric].dropna().values
 
     # Central tendency metrics
     metrics["mean_raw_latency"] = np.mean(raw_latency)
     metrics["median_raw_latency"] = np.median(raw_latency)
-    if trimmed_latency is not None:
-        metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
-        metrics["median_trimmed_latency"] = np.median(trimmed_latency)
 
     # Dispersion metrics
     metrics["std_raw_latency"] = np.std(raw_latency, ddof=1)
@@ -316,20 +289,10 @@ def calculate_stability_metrics( # noqa: C901
     metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile(
         raw_latency, 25
     )
-    if trimmed_latency is not None:
-        metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
-        metrics["cv_trimmed_latency"] = (
-            metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
-        ) * 100
-        metrics["iqr_trimmed_latency"] = np.percentile(
-            trimmed_latency, 75
-        ) - np.percentile(trimmed_latency, 25)
 
     # Percentile metrics
     for p in [50, 90, 95, 99]:
         metrics[f"p{p}_raw_latency"] = np.percentile(raw_latency, p)
-        if trimmed_latency is not None:
-            metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
 
     # Inter-jitter metrics (variability between runs)
     if np.min(raw_latency) > 0:
@@ -342,37 +305,45 @@ def calculate_stability_metrics( # noqa: C901
         metrics["p99_raw_latency"] / metrics["p50_raw_latency"]
     )
 
-    if trimmed_latency is not None:
-        if np.min(trimmed_latency) > 0:
-            metrics["max_min_range_ratio_trimmed"] = np.max(trimmed_latency) / np.min(
-                trimmed_latency
-            )
-        else:
-            metrics["max_min_range_ratio_trimmed"] = float("inf")
-            print(
-                "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+    # Intra-jitter proxy (if both raw and trimmed latency are available)
+    trimmed_metric_col = "trimmean_inference_latency(ms)"
+    if (
+        target_metric == "avg_inference_latency(ms)"
+        and trimmed_metric_col in df.columns
+    ):
+        trimmed_latency = df[trimmed_metric_col].values
+        if trimmed_latency is not None:
+            metrics["mean_trimmed_latency"] = np.mean(trimmed_latency)
+            metrics["median_trimmed_latency"] = np.median(trimmed_latency)
+            metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1)
+            metrics["cv_trimmed_latency"] = (
+                metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"]
+            ) * 100
+            metrics["iqr_trimmed_latency"] = np.percentile(
+                trimmed_latency, 75
+            ) - np.percentile(trimmed_latency, 25)
+            for p in [50, 90, 95, 99]:
+                metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p)
+            if np.min(trimmed_latency) > 0:
+                metrics["max_min_range_ratio_trimmed"] = np.max(
+                    trimmed_latency
+                ) / np.min(trimmed_latency)
+            else:
+                metrics["max_min_range_ratio_trimmed"] = float("inf")
+                print(
+                    "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity"
+                )
+            metrics["p99_p50_ratio_trimmed"] = (
+                metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
             )
-
-        metrics["p99_p50_ratio_trimmed"] = (
-            metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"]
-        )
-
-    # Intra-jitter proxy (if both raw and trimmed are available)
-    if trimmed_latency is not None:
-        trimming_effect = (raw_latency - trimmed_latency) / raw_latency
-        metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
-        metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
-
-    # TPS metrics
-    if tps is not None:
-        metrics["mean_tps"] = np.mean(tps)
-        metrics["std_tps"] = np.std(tps, ddof=1)
-        metrics["cv_tps"] = (metrics["std_tps"] / metrics["mean_tps"]) * 100
+            trimming_effect = (raw_latency - trimmed_latency) / raw_latency
+            metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect)
+            metrics["max_trimming_effect_ratio"] = np.max(trimming_effect)
 
     # Time-based stability (rolling window of 5 samples)
     if len(df) >= 5:
         df_sorted = df.sort_values("Date")
-        rolling_std = df_sorted[raw_col].rolling(window=5).std()
+        rolling_std = df_sorted[target_metric].rolling(window=5).std()
         metrics["mean_rolling_std"] = rolling_std.mean()
         metrics["max_rolling_std"] = rolling_std.max()
 
@@ -419,7 +390,7 @@ def calculate_stability_metrics( # noqa: C901
 
 
 def generate_dataset_report(  # noqa: C901
-    sheet_name, model, device, dataset_type, df, metrics, output_dir
+    sheet_name, target_column, model, device, dataset_type, df, metrics, output_dir
 ):
     """Generate a detailed report for a single dataset"""
     report_file = f"{output_dir}/{sheet_name}_{dataset_type.lower()}_report.txt"
@@ -436,7 +407,9 @@ def generate_dataset_report( # noqa: C901
 
     # Dataset overview
     report_content.append("Dataset Overview:")
-    report_content.append(f" - Number of samples: {len(df)}")
+    report_content.append(
+        f" - Number of samples: {len(df[target_column].dropna().values)}"
+    )
     report_content.append(f" - Date range: {df['Date'].min()} to {df['Date'].max()}")
     report_content.append("")
 
@@ -1541,17 +1514,34 @@ def main():
         help="Path to Excel file containing reference (public) benchmark data for comparison",
         default=None,
     )
+    parser.add_argument(
+        "--metric",
+        help="Target metric to analyze (default: avg_inference_latency(ms)). Examples: avg_inference_latency(ms), token_per_sec",
+        default="avg_inference_latency(ms)",
+    )
     parser.add_argument(
         "--output-dir",
         default="stability_analysis_results",
         help="Directory to save analysis results (default: stability_analysis_results)",
     )
-
+    parser.add_argument(
+        "--verbose-level",
+        type=int,
+        default=0,
+        choices=range(4),
+        help="Verbose level 0-3 (default: 0) to control analysis output detail. Higher values show more detailed results.",
+    )
     # Parse arguments
     args = parser.parse_args()
 
     # Run analysis
-    analyze_latency_stability(args.primary_file, args.reference_file, args.output_dir)
+    analyze_latency_stability(
+        args.metric,
+        args.primary_file,
+        args.reference_file,
+        args.output_dir,
+        args.verbose_level,
+    )
 
 
 if __name__ == "__main__":
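
For reference, a minimal usage sketch of the call signature introduced by this diff. The import path and input file name below are hypothetical; only the argument order, defaults, and verbose_level behaviour follow the changes above.

    # Hypothetical module name and input file; the signature matches this diff.
    from stability_analysis import analyze_latency_stability

    analyze_latency_stability(
        "avg_inference_latency(ms)",  # target_metric is now the first positional argument
        "primary_benchmarks.xlsx",  # hypothetical primary Excel file
        reference_file=None,
        output_dir="stability_analysis_results",
        verbose_level=1,  # >0 additionally prints the intra-primary stability comparison
    )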