@@ -1105,23 +1105,123 @@ def _extract_benchmark_metrics(self, raw_data: Dict[str, Any]) -> BenchmarkMetrics:
 
     @staticmethod
     def _extract_vllm_latency_metrics(data: Dict[str, Any]) -> BenchmarkMetrics:
-        """Extract metrics from vLLM latency benchmark results"""
+        """Extract metrics from vLLM latency benchmark results.
+
+        This method processes vLLM latency benchmark data and calculates standardized
+        performance metrics. It handles the important distinction between per-request
+        completion rate and system-level throughput for batch processing scenarios.
+
+        **Key Metric Definitions:**
+
+        - **Per-Request Completion Rate**: 1 / avg_latency (requests/second/experiment)
+          - Measures how frequently individual requests complete
+          - Industry standard for latency benchmarks
+          - Lower values for larger batch sizes due to queueing delays
+
+        - **Batch-Level Throughput**: batch_size / avg_latency (theoretical max)
+          - Measures aggregate system processing capacity
+          - Not calculated here as batch_size is not available in this context
+          - Calculated separately at the experiment level
+
+        **Why Per-Request Rates Appear "Low" for Large Batches:**
+
+        For batch sizes >1, individual requests experience queueing delays:
+        - Batch size 1: Request processes immediately → high completion rate
+        - Batch size 32: Each request waits for 31 others → low completion rate
+        - This reflects real-world user experience in interactive systems
+
+        Args:
+            data (Dict[str, Any]): Raw vLLM benchmark results containing:
+                - avg_latency: Mean response time across all requests (seconds)
+                - latencies: List of individual request latencies (optional)
+                - percentiles: Dictionary of latency percentiles (optional)
+
+        Returns:
+            BenchmarkMetrics: Standardized metrics object with:
+                - avg_latency: Average request latency (seconds)
+                - throughput: Per-request completion rate (requests/second)
+                - percentile metrics: P50, P90, P95, P99 latencies
+                - statistical measures: Standard deviation, iteration count
+
+        Raises:
+            KeyError: If required 'avg_latency' field is missing from data
+            ValueError: If avg_latency is zero or negative
+
+        Example:
+            ```
+            # Sample vLLM latency data
+            vllm_data = {
+                "avg_latency": 2.5,  # seconds per request
+                "latencies": [2.3, 2.5, 2.7, 2.4, 2.6],
+                "percentiles": {"50": 2.5, "90": 2.7, "95": 2.8, "99": 2.9}
+            }
+
+            metrics = BenchmarkAnalyzer._extract_vllm_latency_metrics(vllm_data)
+
+            # Results interpretation:
+            print(f"Request completion rate: {metrics.throughput:.3f} req/s")
+            # Output: "Request completion rate: 0.400 req/s"
+
+            # For batch size 8, system throughput would be:
+            # system_throughput = 8 * metrics.throughput = 3.200 req/s
+            # (calculated at experiment level with batch size context)
+            ```
+
+        Note:
+            This method focuses on per-request performance metrics consistent with
+            latency benchmarking standards. System-level throughput calculations
+            requiring batch size context are handled in the experiment analysis layer.
+        """
+
+        # Validate required fields
+        avg_latency = data.get("avg_latency")
+        if avg_latency is None:
+            raise KeyError("Required field 'avg_latency' missing from benchmark data")
+        if avg_latency <= 0:
+            raise ValueError(f"Invalid avg_latency value: {avg_latency}. Must be positive.")
+
+        # Extract optional data with robust handling
         latencies = data.get("latencies", [])
         percentiles = data.get("percentiles", {})
 
-        # Calculate additional statistics if not provided
-        import statistics
+        # Calculate statistical measures
+        latency_std = 0.0
+        total_iterations = len(latencies)
+
+        if len(latencies) > 1:
+            import statistics
+
+            try:
+                latency_std = statistics.stdev(latencies)
+            except statistics.StatisticsError:
+                logger.warning("Could not calculate latency standard deviation")
+                latency_std = 0.0
+
+        # Extract percentiles with flexible key formats
+        # vLLM may use string keys ("50") or integer keys (50)
+        def get_percentile(p: int) -> float:
+            """Extract percentile with flexible key handling."""
+            return float(percentiles.get(str(p), percentiles.get(p, 0.0)))
+
+        # Calculate per-request completion rate (industry standard for latency benchmarks)
+        per_request_completion_rate = 1.0 / avg_latency
 
         return BenchmarkMetrics(
-            avg_latency=data.get("avg_latency", 0.0),
-            latency_std=statistics.stdev(latencies) if len(latencies) > 1 else 0.0,
-            p50_latency=percentiles.get("50", percentiles.get(50, 0.0)),
-            p90_latency=percentiles.get("90", percentiles.get(90, 0.0)),
-            p95_latency=percentiles.get("95", percentiles.get(95, 0.0)),
-            p99_latency=percentiles.get("99", percentiles.get(99, 0.0)),
-            throughput=1.0 / data.get("avg_latency", 1.0),  # Requests per second
-            tokens_per_second=0.0,  # Not available in latency data
-            total_iterations=len(latencies),
+            # Core latency metrics
+            avg_latency=avg_latency,
+            latency_std=latency_std,
+            # Percentile latencies (in seconds)
+            p50_latency=get_percentile(50),
+            p90_latency=get_percentile(90),
+            p95_latency=get_percentile(95),
+            p99_latency=get_percentile(99),
+            # Per-request completion rate (requests/second per experiment)
+            # Note: This is NOT system-level throughput for batch processing
+            throughput=per_request_completion_rate,
+            # Token-level metrics (not available in latency-only benchmarks)
+            tokens_per_second=0.0,
+            # Experimental metadata
+            total_iterations=total_iterations if total_iterations > 0 else 1,
         )
 
     @staticmethod
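The new docstring defers system-level throughput to the experiment analysis layer. As a rough, self-contained sketch of that calculation, the per-request rate returned by this method combines with the batch size known at the experiment level as shown below; the variable names (`batch_size`, `per_request_rate`, `system_throughput`) are illustrative only, not the project's experiment-layer API, and the figures match the docstring example.

```python
# Minimal sketch (not the project's actual experiment-layer code): deriving
# system-level throughput from the per-request completion rate that
# _extract_vllm_latency_metrics stores in BenchmarkMetrics.throughput.

avg_latency = 2.5   # seconds per request, as reported by the vLLM latency benchmark
batch_size = 8      # supplied by the experiment context, not available inside the extractor

# What the extractor returns as `throughput`:
per_request_rate = 1.0 / avg_latency                 # 0.400 req/s

# Theoretical aggregate capacity, computed at the experiment level:
system_throughput = batch_size * per_request_rate    # 3.200 req/s

print(f"Per-request completion rate: {per_request_rate:.3f} req/s")
print(f"System-level throughput (batch size {batch_size}): {system_throughput:.3f} req/s")
```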