diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py
index 1e2a5f4b..87834e74 100644
--- a/src/guidellm/benchmark/benchmark.py
+++ b/src/guidellm/benchmark/benchmark.py
@@ -817,7 +817,7 @@ def from_stats(
                 ],
                 iter_counts=[req.output_tokens for req in total_with_output_first],
                 first_iter_counts=[
-                    req.prompt_tokens for req in total_with_output_first
+                    req.prompt_tokens + 1 for req in total_with_output_first
                 ],
             ),
         ),
diff --git a/src/guidellm/objects/statistics.py b/src/guidellm/objects/statistics.py
index 7831b2cf..0299b2c3 100644
--- a/src/guidellm/objects/statistics.py
+++ b/src/guidellm/objects/statistics.py
@@ -219,7 +219,7 @@ def from_values(
         )
 
     @staticmethod
-    def from_request_times(
+    def from_request_times(  # noqa: C901
         requests: list[tuple[float, float]],
         distribution_type: Literal["concurrency", "rate"],
         include_cdf: bool = False,
@@ -248,13 +248,7 @@ def from_request_times(
                 time_deltas[start] += 1
                 time_deltas[end] -= 1
 
-            # convert to the events over time measuring concurrency changes
-            events = []
-            active = 0
-
-            for time, delta in sorted(time_deltas.items()):
-                active += delta
-                events.append((time, active))
+            events = list(time_deltas.items())
         elif distribution_type == "rate":
             # convert to events for when requests finished
             global_start = min(start for start, _ in requests) if requests else 0
@@ -281,6 +275,16 @@ def from_request_times(
             else:
                 flattened_events.append((time, val))
 
+        if distribution_type == "concurrency":
+            # convert to the events over time measuring concurrency changes
+            events_over_time: list[tuple[float, float]] = []
+            active = 0
+            for time, delta in flattened_events:
+                active += delta  # type: ignore [assignment]
+                events_over_time.append((time, active))
+
+            flattened_events = events_over_time
+
         # convert to value distribution function
         distribution: dict[float, float] = defaultdict(float)
 
diff --git a/tests/unit/objects/test_statistics.py b/tests/unit/objects/test_statistics.py
index fa8cccd0..ede77175 100644
--- a/tests/unit/objects/test_statistics.py
+++ b/tests/unit/objects/test_statistics.py
@@ -704,3 +704,82 @@ def test_time_running_stats_update():
     assert time_running_stats.rate_ms == pytest.approx(
         3000 / (time.time() - time_running_stats.start_time), abs=0.1
     )
+
+
+@pytest.mark.regression
+def test_distribution_summary_concurrency_double_counting_regression():
+    """Specific regression test for the double-counting bug in concurrency calculation.
+
+    Before the fix, when events were merged due to epsilon, the deltas were summed
+    but then the active count wasn't properly accumulated, causing incorrect results.
+
+    ### WRITTEN BY AI ###
+    """
+    epsilon = 1e-6
+
+    # Create a scenario where multiple requests start at exactly the same time.
+    # This should result in events being merged, testing the accumulation logic.
+    same_start_time = 1.0
+    requests = [
+        (same_start_time, 3.0),
+        (same_start_time, 4.0),
+        (same_start_time, 5.0),
+        (same_start_time + epsilon / 3, 6.0),  # Very close start (within epsilon)
+    ]
+
+    distribution_summary = DistributionSummary.from_request_times(
+        requests, distribution_type="concurrency", epsilon=epsilon
+    )
+
+    # All requests start at the same time (or within epsilon), so they should
+    # all be considered concurrent from the start.
+    # Expected timeline:
+    # - t=1.0-3.0: 4 concurrent requests
+    # - t=3.0-4.0: 3 concurrent requests
+    # - t=4.0-5.0: 2 concurrent requests
+    # - t=5.0-6.0: 1 concurrent request
+
+    assert distribution_summary.max == 4.0  # All 4 requests concurrent at start
+    assert distribution_summary.min == 1.0  # 1 request still running at the end
+
+
+@pytest.mark.sanity
+def test_distribution_summary_concurrency_epsilon_edge_case():
+    """Test the exact epsilon boundary condition.
+
+    ### WRITTEN BY AI ###
+    """
+    epsilon = 1e-6
+
+    # Requests that are exactly epsilon apart - should be merged
+    requests_exactly_epsilon = [
+        (1.0, 2.0),
+        (1.0 + epsilon, 2.5),  # Exactly epsilon apart
+        (2.0, 2.5),  # Another close request
+    ]
+
+    dist_epsilon = DistributionSummary.from_request_times(
+        requests_exactly_epsilon, distribution_type="concurrency", epsilon=epsilon
+    )
+
+    # Should be treated as concurrent (merged events)
+    assert dist_epsilon.max == 2.0
+    assert dist_epsilon.min == 2.0
+
+    # Requests that are just over epsilon apart - should NOT be merged
+    requests_over_epsilon = [
+        (1.0, 2.0),
+        (1.0 + epsilon * 1.1, 2.5),  # Just over epsilon apart
+        (2.0, 2.5),  # Another close request
+    ]
+
+    dist_over_epsilon = DistributionSummary.from_request_times(
+        requests_over_epsilon, distribution_type="concurrency", epsilon=epsilon
+    )
+
+    # These starts are treated separately, so max concurrency depends on overlap:
+    # At t=1.0 to 1.0+epsilon*1.1: 1 concurrent
+    # At t=1.0+epsilon*1.1 to 2.0: 2 concurrent
+    # At t=2.0 to 2.5: 2 concurrent (the second and third requests overlap)
+    assert dist_over_epsilon.max == 2.0
+    assert dist_over_epsilon.min == 1.0
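
For readers skimming the patch, here is a minimal standalone sketch (illustrative only, not part of the diff or the library API; the helper name concurrency_timeline is made up) of the ordering the statistics.py hunks establish: epsilon-merging is applied to the raw +1/-1 deltas first, and the running concurrency is accumulated only afterwards, so merged deltas sum correctly instead of double-counting already-accumulated active counts.

from collections import defaultdict


def concurrency_timeline(
    requests: list[tuple[float, float]], epsilon: float = 1e-6
) -> list[tuple[float, int]]:
    """Sketch of the fixed ordering: merge raw +1/-1 deltas that fall within
    epsilon of each other, then accumulate them into concurrency levels.

    `requests` is a list of (start, end) tuples, as accepted by
    DistributionSummary.from_request_times.
    """
    # 1) Raw deltas: +1 at each request start, -1 at each request end.
    deltas: dict[float, int] = defaultdict(int)
    for start, end in requests:
        deltas[start] += 1
        deltas[end] -= 1

    # 2) Merge deltas whose timestamps are within epsilon. Summing deltas here
    #    is safe; summing already-accumulated concurrency values was the bug.
    merged: list[tuple[float, int]] = []
    for time, delta in sorted(deltas.items()):
        if merged and (time - merged[-1][0]) <= epsilon:
            merged[-1] = (merged[-1][0], merged[-1][1] + delta)
        else:
            merged.append((time, delta))

    # 3) Accumulate deltas into the number of active requests over time.
    timeline: list[tuple[float, int]] = []
    active = 0
    for time, delta in merged:
        active += delta
        timeline.append((time, active))
    return timeline


if __name__ == "__main__":
    # Mirrors the regression test above: four requests starting together should
    # peak at 4 concurrent requests, not at an inflated, double-counted value.
    requests = [(1.0, 3.0), (1.0, 4.0), (1.0, 5.0), (1.0 + 1e-6 / 3, 6.0)]
    print(concurrency_timeline(requests))
    # -> [(1.0, 4), (3.0, 3), (4.0, 2), (5.0, 1), (6.0, 0)]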