|
37 | 37 | "BenchmarkArgs", |
38 | 38 | "BenchmarkRunStats", |
39 | 39 | "Benchmark", |
| 40 | + "BenchmarkMetrics", |
40 | 41 | "GenerativeTextResponseStats", |
41 | 42 | "GenerativeTextErrorStats", |
| 43 | + "GenerativeMetrics", |
42 | 44 | "GenerativeBenchmark", |
43 | 45 | ] |
44 | 46 |
|
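Both new models are exported through `__all__`. A minimal import sketch; the module path `guidellm.benchmark.benchmark` is an assumption, not confirmed by this diff:

```python
# Module path is hypothetical -- adjust to wherever these models are defined.
from guidellm.benchmark.benchmark import (
    Benchmark,
    BenchmarkMetrics,
    GenerativeBenchmark,
    GenerativeMetrics,
)
```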
@@ -234,6 +236,19 @@ def total(self) -> int: |
234 | 236 | return self.total_successful + self.total_incomplete + self.total_errored |
235 | 237 |
|
236 | 238 |
|
| 239 | +class BenchmarkMetrics(StandardBaseModel): |
| 240 | + """ |
| 241 | + A serializable model representing the metrics for a benchmark run. |
| 242 | + """ |
| 243 | + |
| 244 | + requests_per_second: StatusDistributionSummary = Field( |
| 245 | + description="The distribution of requests per second for the benchmark.", |
| 246 | + ) |
| 247 | + request_concurrency: StatusDistributionSummary = Field( |
| 248 | + description="The distribution of request concurrency for the benchmark.", |
| 249 | + ) |
| 250 | + |
| 251 | + |
237 | 252 | class Benchmark(StandardBaseModel): |
238 | 253 | """ |
239 | 254 | The base serializable model representing a benchmark run and its results. |
@@ -291,11 +306,11 @@ class Benchmark(StandardBaseModel): |
291 | 306 | ) |
292 | 307 | ) |
293 | 308 |
|
294 | | - requests_per_second: StatusDistributionSummary = Field( |
295 | | - description="The distribution of requests per second for the benchmark.", |
296 | | - ) |
297 | | - requests_concurrency: StatusDistributionSummary = Field( |
298 | | - description="The distribution of requests concurrency for the benchmark.", |
| 309 | + metrics: BenchmarkMetrics = Field( |
| 310 | + description=( |
| 311 | + "The metrics for the benchmark run represented as a distribution of " |
| 312 | + "various per-request statistics." |
| 313 | + ), |
299 | 314 | ) |
300 | 315 |
|
301 | 316 |
|
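For callers, the two distributions that used to live directly on `Benchmark` now sit under the nested `metrics` model. A sketch of the access-pattern change; the `successful` accessor and `mean` statistic on `StatusDistributionSummary` are assumptions based on the field descriptions, not confirmed API:

```python
# Before this change (hypothetical accessors on StatusDistributionSummary):
mean_rps = benchmark.requests_per_second.successful.mean

# After this change, the same distributions are reached through `metrics`:
mean_rps = benchmark.metrics.requests_per_second.successful.mean
concurrency = benchmark.metrics.request_concurrency.successful.mean
```

Grouping the distributions in one sub-model keeps `Benchmark` itself small and lets `GenerativeMetrics` extend the base set by inheritance, as the next hunk shows.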
@@ -506,6 +521,59 @@ def output_tokens_per_second(self) -> Optional[float]: # type: ignore[override] |
506 | 521 | return super().output_tokens_per_second |
507 | 522 |
|
508 | 523 |
|
| 524 | +class GenerativeMetrics(BenchmarkMetrics): |
| 525 | + """ |
| 526 | + A serializable model representing the metrics for a generative benchmark run. |
| 527 | + """ |
| 528 | + |
| 529 | + request_latency: StatusDistributionSummary = Field( |
| 530 | + description="The distribution of latencies for the completed requests.", |
| 531 | + ) |
| 532 | + prompt_token_count: StatusDistributionSummary = Field( |
| 533 | + description=( |
| 534 | + "The distribution of token counts in the prompts for completed, " |
| 535 | + "errored, and all requests." |
| 536 | + ), |
| 537 | + ) |
| 538 | + output_token_count: StatusDistributionSummary = Field( |
| 539 | + description=( |
| 540 | + "The distribution of token counts in the outputs for completed, " |
| 541 | + "errored, and all requests." |
| 542 | + ), |
| 543 | + ) |
| 544 | + time_to_first_token_ms: StatusDistributionSummary = Field( |
| 545 | + description=( |
| 546 | + "The distribution of latencies to receiving the first token in " |
| 547 | + "milliseconds for completed, errored, and all requests." |
| 548 | + ), |
| 549 | + ) |
| 550 | + time_per_output_token_ms: StatusDistributionSummary = Field( |
| 551 | + description=( |
| 552 | + "The distribution of latencies per output token in milliseconds for " |
| 553 | + "completed, errored, and all requests. " |
| 554 | + "This includes the time to generate the first token and all other tokens." |
| 555 | + ), |
| 556 | + ) |
| 557 | + inter_token_latency_ms: StatusDistributionSummary = Field( |
| 558 | + description=( |
| 559 | + "The distribution of latencies between tokens in milliseconds for " |
| 560 | + "completed, errored, and all requests." |
| 561 | + ), |
| 562 | + ) |
| 563 | + output_tokens_per_second: StatusDistributionSummary = Field( |
| 564 | + description=( |
| 565 | + "The distribution of output tokens per second for completed, " |
| 566 | + "errored, and all requests." |
| 567 | + ), |
| 568 | + ) |
| 569 | + tokens_per_second: StatusDistributionSummary = Field( |
| 570 | + description=( |
| 571 | + "The distribution of tokens per second, including prompt and output tokens " |
| 572 | + "for completed, errored, and all requests." |
| 573 | + ), |
| 574 | + ) |
| 575 | + |
| 576 | + |
509 | 577 | class GenerativeBenchmark(Benchmark): |
510 | 578 | """ |
511 | 579 | A serializable model representing a benchmark run and its results for generative |
@@ -568,51 +636,10 @@ class GenerativeBenchmark(Benchmark): |
568 | 636 | end_time: float = Field( |
569 | 637 | description="The end time of the last request for the benchmark.", |
570 | 638 | ) |
571 | | - |
572 | | - request_latency: StatusDistributionSummary = Field( |
573 | | - description="The distribution of latencies for the completed requests.", |
574 | | - ) |
575 | | - prompt_token_count: StatusDistributionSummary = Field( |
576 | | - description=( |
577 | | - "The distribution of token counts in the prompts for completed, " |
578 | | - "errored, and all requests." |
579 | | - ) |
580 | | - ) |
581 | | - output_token_count: StatusDistributionSummary = Field( |
582 | | - description=( |
583 | | - "The distribution of token counts in the outputs for completed, " |
584 | | - "errored, and all requests." |
585 | | - ) |
586 | | - ) |
587 | | - time_to_first_token_ms: StatusDistributionSummary = Field( |
| 639 | + metrics: GenerativeMetrics = Field( |
588 | 640 | description=( |
589 | | - "The distribution of latencies to receiving the first token in " |
590 | | - "milliseconds for completed, errored, and all requests." |
591 | | - ), |
592 | | - ) |
593 | | - time_per_output_token_ms: StatusDistributionSummary = Field( |
594 | | - description=( |
595 | | - "The distribution of latencies per output token in milliseconds for " |
596 | | - "completed, errored, and all requests. " |
597 | | - "This includes the time to generate the first token and all other tokens." |
598 | | - ), |
599 | | - ) |
600 | | - inter_token_latency_ms: StatusDistributionSummary = Field( |
601 | | - description=( |
602 | | - "The distribution of latencies between tokens in milliseconds for " |
603 | | - "completed, errored, and all requests." |
604 | | - ), |
605 | | - ) |
606 | | - output_tokens_per_second: StatusDistributionSummary = Field( |
607 | | - description=( |
608 | | - "The distribution of output tokens per second for completed, " |
609 | | - "errored, and all requests." |
610 | | - ), |
611 | | - ) |
612 | | - tokens_per_second: StatusDistributionSummary = Field( |
613 | | - description=( |
614 | | - "The distribution of tokens per second, including prompt and output tokens " |
615 | | - "for completed, errored, and all requests." |
| 641 | + "The metrics for the benchmark run represented as a distribution of " |
| 642 | + "various per-request statistics." |
616 | 643 | ), |
617 | 644 | ) |
618 | 645 |
|
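`GenerativeBenchmark` gets the same treatment: the eight per-request distributions collapse into a single `metrics: GenerativeMetrics` field. A sketch of a consumer reading the nested layout; the `successful` and `median` accessors are assumptions about `StatusDistributionSummary`, not confirmed API:

```python
def summarize(benchmark: "GenerativeBenchmark") -> dict[str, float]:
    """Pull a few headline statistics from the nested metrics model.

    Accessor names (`successful`, `median`) are assumptions, not
    confirmed by this diff.
    """
    m = benchmark.metrics
    return {
        "ttft_ms": m.time_to_first_token_ms.successful.median,
        "itl_ms": m.inter_token_latency_ms.successful.median,
        "output_tps": m.output_tokens_per_second.successful.median,
    }
```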
@@ -793,74 +820,76 @@ def from_stats( |
793 | 820 | errored_requests=errored, |
794 | 821 | start_time=start_time, |
795 | 822 | end_time=end_time, |
796 | | - requests_per_second=StatusDistributionSummary.from_request_times( |
797 | | - request_types=total_types, |
798 | | - requests=[(req.start_time, req.end_time) for req in total], |
799 | | - distribution_type="rate", |
800 | | - ), |
801 | | - requests_concurrency=StatusDistributionSummary.from_request_times( |
802 | | - request_types=total_types, |
803 | | - requests=[(req.start_time, req.end_time) for req in total], |
804 | | - distribution_type="concurrency", |
805 | | - ), |
806 | | - request_latency=StatusDistributionSummary.from_values( |
807 | | - value_types=total_types, |
808 | | - values=[req.request_latency for req in total], |
809 | | - ), |
810 | | - prompt_token_count=StatusDistributionSummary.from_values( |
811 | | - value_types=list(total_types_with_prompt), |
812 | | - values=[req.prompt_tokens for req in total_with_prompt], |
813 | | - ), |
814 | | - output_token_count=StatusDistributionSummary.from_values( |
815 | | - value_types=list(total_types_with_output_first), |
816 | | - values=[req.output_tokens for req in total_with_output_first], |
817 | | - ), |
818 | | - time_to_first_token_ms=StatusDistributionSummary.from_values( |
819 | | - value_types=list(total_types_with_output_first), |
820 | | - values=[ |
821 | | - req.time_to_first_token_ms or 0 for req in total_with_output_first |
822 | | - ], |
823 | | - ), |
824 | | - time_per_output_token_ms=StatusDistributionSummary.from_values( |
825 | | - value_types=list(total_types_with_output_first), |
826 | | - values=[ |
827 | | - req.time_per_output_token_ms or 0 for req in total_with_output_first |
828 | | - ], |
829 | | - weights=[req.output_tokens for req in total_with_output_first], |
830 | | - ), |
831 | | - inter_token_latency_ms=StatusDistributionSummary.from_values( |
832 | | - value_types=list(total_types_with_output_multi), |
833 | | - values=[ |
834 | | - req.inter_token_latency_ms or 0 for req in total_with_output_multi |
835 | | - ], |
836 | | - weights=[req.output_tokens - 1 for req in total_with_output_multi], |
837 | | - ), |
838 | | - output_tokens_per_second=StatusDistributionSummary.from_iterable_request_times( |
839 | | - request_types=list(total_types_with_output_first), |
840 | | - requests=[ |
841 | | - (req.start_time, req.end_time) for req in total_with_output_first |
842 | | - ], |
843 | | - first_iter_times=[ |
844 | | - req.first_token_time or req.start_time |
845 | | - for req in total_with_output_first |
846 | | - ], |
847 | | - iter_counts=[req.output_tokens for req in total_with_output_first], |
848 | | - ), |
849 | | - tokens_per_second=StatusDistributionSummary.from_iterable_request_times( |
850 | | - request_types=list(total_types_with_output_first), |
851 | | - requests=[ |
852 | | - (req.start_time, req.end_time) for req in total_with_output_first |
853 | | - ], |
854 | | - first_iter_times=[ |
855 | | - req.first_token_time or req.start_time |
856 | | - for req in total_with_output_first |
857 | | - ], |
858 | | - iter_counts=[ |
859 | | - req.prompt_tokens + req.output_tokens |
860 | | - for req in total_with_output_first |
861 | | - ], |
862 | | - first_iter_counts=[ |
863 | | - req.prompt_tokens for req in total_with_output_first |
864 | | - ], |
| 823 | + metrics=GenerativeMetrics( |
| 824 | + requests_per_second=StatusDistributionSummary.from_request_times( |
| 825 | + request_types=total_types, |
| 826 | + requests=[(req.start_time, req.end_time) for req in total], |
| 827 | + distribution_type="rate", |
| 828 | + ), |
| 829 | + request_concurrency=StatusDistributionSummary.from_request_times( |
| 830 | + request_types=total_types, |
| 831 | + requests=[(req.start_time, req.end_time) for req in total], |
| 832 | + distribution_type="concurrency", |
| 833 | + ), |
| 834 | + request_latency=StatusDistributionSummary.from_values( |
| 835 | + value_types=total_types, |
| 836 | + values=[req.request_latency for req in total], |
| 837 | + ), |
| 838 | + prompt_token_count=StatusDistributionSummary.from_values( |
| 839 | + value_types=list(total_types_with_prompt), |
| 840 | + values=[req.prompt_tokens for req in total_with_prompt], |
| 841 | + ), |
| 842 | + output_token_count=StatusDistributionSummary.from_values( |
| 843 | + value_types=list(total_types_with_output_first), |
| 844 | + values=[req.output_tokens for req in total_with_output_first], |
| 845 | + ), |
| 846 | + time_to_first_token_ms=StatusDistributionSummary.from_values( |
| 847 | + value_types=list(total_types_with_output_first), |
| 848 | + values=[ |
| 849 | + req.time_to_first_token_ms or 0 for req in total_with_output_first |
| 850 | + ], |
| 851 | + ), |
| 852 | + time_per_output_token_ms=StatusDistributionSummary.from_values( |
| 853 | + value_types=list(total_types_with_output_first), |
| 854 | + values=[ |
| 855 | + req.time_per_output_token_ms or 0 for req in total_with_output_first |
| 856 | + ], |
| 857 | + weights=[req.output_tokens for req in total_with_output_first], |
| 858 | + ), |
| 859 | + inter_token_latency_ms=StatusDistributionSummary.from_values( |
| 860 | + value_types=list(total_types_with_output_multi), |
| 861 | + values=[ |
| 862 | + req.inter_token_latency_ms or 0 for req in total_with_output_multi |
| 863 | + ], |
| 864 | + weights=[req.output_tokens - 1 for req in total_with_output_multi], |
| 865 | + ), |
| 866 | + output_tokens_per_second=StatusDistributionSummary.from_iterable_request_times( |
| 867 | + request_types=list(total_types_with_output_first), |
| 868 | + requests=[ |
| 869 | + (req.start_time, req.end_time) for req in total_with_output_first |
| 870 | + ], |
| 871 | + first_iter_times=[ |
| 872 | + req.first_token_time or req.start_time |
| 873 | + for req in total_with_output_first |
| 874 | + ], |
| 875 | + iter_counts=[req.output_tokens for req in total_with_output_first], |
| 876 | + ), |
| 877 | + tokens_per_second=StatusDistributionSummary.from_iterable_request_times( |
| 878 | + request_types=list(total_types_with_output_first), |
| 879 | + requests=[ |
| 880 | + (req.start_time, req.end_time) for req in total_with_output_first |
| 881 | + ], |
| 882 | + first_iter_times=[ |
| 883 | + req.first_token_time or req.start_time |
| 884 | + for req in total_with_output_first |
| 885 | + ], |
| 886 | + iter_counts=[ |
| 887 | + req.prompt_tokens + req.output_tokens |
| 888 | + for req in total_with_output_first |
| 889 | + ], |
| 890 | + first_iter_counts=[ |
| 891 | + req.prompt_tokens for req in total_with_output_first |
| 892 | + ], |
| 893 | + ), |
865 | 894 | ), |
866 | 895 | ) |
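One subtlety in `from_stats` carries over unchanged: `time_per_output_token_ms` is weighted by `output_tokens` and `inter_token_latency_ms` by `output_tokens - 1` (a response with n tokens has n - 1 inter-token gaps), so these aggregates are token-weighted rather than request-weighted. A self-contained sketch of the weighted-mean idea this implies (plain Python, not the `StatusDistributionSummary` implementation):

```python
def weighted_mean(values: list[float], weights: list[float]) -> float:
    """Token-weighted mean: long responses count proportionally more."""
    total_weight = sum(weights)
    if total_weight == 0:
        return 0.0
    return sum(v * w for v, w in zip(values, weights)) / total_weight

# Two requests: 10 ms ITL over 4 gaps (5 tokens), 30 ms ITL over 1 gap (2 tokens).
# A request-weighted mean would be 20 ms; the token-weighted mean is
# (10 * 4 + 30 * 1) / 5 = 14 ms.
print(weighted_mean([10.0, 30.0], [5 - 1, 2 - 1]))  # 14.0
```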