From 7e90a738389bf718091663f2971fc44adc622cea Mon Sep 17 00:00:00 2001 From: dalthecow Date: Tue, 4 Mar 2025 19:44:03 -0500 Subject: [PATCH 1/9] set up data for ui --- src/guidellm/core/result.py | 16 +++ src/guidellm/main.py | 4 +- src/guidellm/utils/__init__.py | 2 + src/guidellm/utils/generate_ui_data.py | 138 +++++++++++++++++++++++++ 4 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 src/guidellm/utils/generate_ui_data.py diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index f218784c..8942b675 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -403,6 +403,22 @@ def output_token_throughput(self) -> float: return total_tokens / self.duration + @property + def output_token_throughput_distribution(self) -> Distribution: + """ + Get the distribution for output token throughput. + + :return: The distribution of output token throughput. + :rtype: Distribution + """ + throughputs = [] + for r in self.results: + duration = (r.end_time or 0) - (r.start_time or 0) + if duration > 0: + throughputs.append(r.output_token_count / duration) + + return Distribution(data=throughputs) + @property def prompt_token_distribution(self) -> Distribution: """ diff --git a/src/guidellm/main.py b/src/guidellm/main.py index 4016ecec..01b744d2 100644 --- a/src/guidellm/main.py +++ b/src/guidellm/main.py @@ -14,6 +14,7 @@ ) from guidellm.request.base import RequestGenerator from guidellm.utils import BenchmarkReportProgress, cli_params +from guidellm.utils.generate_ui_data import generate_ui_api_data __all__ = ["generate_benchmark_report"] @@ -183,7 +184,6 @@ def generate_benchmark_report_cli( cont_refresh_table=enable_continuous_refresh, ) - def generate_benchmark_report( target: str, backend: BackendEnginePublic, @@ -285,6 +285,8 @@ def generate_benchmark_report( ) report = asyncio.run(_run_executor_for_result(executor)) + generate_ui_api_data(report) + # Save and print report guidance_report = GuidanceReport() guidance_report.benchmarks.append(report) diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 2fdd8ca8..96c02049 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,4 +1,5 @@ from .injector import create_report, inject_data +from .generate_ui_data import generate_ui_api_data from .progress import BenchmarkReportProgress from .text import ( clean_text, @@ -24,6 +25,7 @@ "clean_text", "create_report", "filter_text", + "generate_ui_api_data", "inject_data", "is_path", "is_path_like", diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py new file mode 100644 index 00000000..e9415cc7 --- /dev/null +++ b/src/guidellm/utils/generate_ui_data.py @@ -0,0 +1,138 @@ +import os +import json +import random +from typing import Any, Dict, List +from guidellm.core.distribution import Distribution +from guidellm.core import TextGenerationBenchmarkReport, TextGenerationBenchmark + +def generate_metric_report(dist: Distribution, metric_label: str, n_buckets: int = 18): + total = dist.__len__() + mean = dist.mean + median = dist.median + minv = dist.min + maxv = dist.max + std_dev = dist.std_deviation + + pvals = dist.percentiles([50, 90, 95, 99]) + + percentile_list = [ + {"percentile": "p50", "value": pvals[0]}, + {"percentile": "p90", "value": pvals[1]}, + {"percentile": "p95", "value": pvals[2]}, + {"percentile": "p99", "value": pvals[3]}, + ] + + if dist.range == 0: + buckets = [{"value": minv, "count": total}] + bucket_width = 0 + else: + bucket_width 
= dist.range / n_buckets + bucket_counts = [0] * n_buckets + + for val in dist.data: + + idx = int((val - minv) // bucket_width) + if idx == n_buckets: + idx = n_buckets - 1 + bucket_counts[idx] += 1 + + buckets = [] + for i, count in enumerate(bucket_counts): + bucket_start = minv + i * bucket_width + buckets.append({ + "value": bucket_start, + "count": count + }) + + return { + metric_label: { + "statistics": { + "total": total, + "mean": mean, + "median": median, + "min": minv, + "max": maxv, + "std": std_dev, + }, + "percentiles": percentile_list, + "buckets": buckets, + "bucketWidth": bucket_width, + } + } + +def generate_run_info(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: + timestamp = max(map(lambda bm: bm.end_time, report.benchmarks)) + return { + "model": { + "name": report.args.get('model', 'N/A'), + "size": 0 + }, + "task": "N/A", + "dataset": "N/A", + "timestamp": timestamp + } + +def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: + all_prompt_token_data = [data for benchmark in report.benchmarks for data in benchmark.prompt_token_distribution.data] + all_prompt_token_distribution = Distribution(data=all_prompt_token_data) + all_output_token_data = [data for benchmark in report.benchmarks for data in benchmark.output_token_distribution.data] + all_output_token_distribution = Distribution(data=all_output_token_data) + + prompt_token_data = generate_metric_report(all_prompt_token_distribution, "tokenDistributions") + prompt_token_samples = [result.prompt for benchmark in report.benchmarks for result in benchmark.results] + sample_prompts = random.sample(prompt_token_samples, min(5, len(prompt_token_samples))) + output_token_data = generate_metric_report(all_output_token_distribution, "tokenDistributions") + output_token_samples = [result.output for benchmark in report.benchmarks for result in benchmark.results] + sample_outputs = random.sample(output_token_samples, min(5, len(output_token_samples))) + return { + "prompts": { + "samples": sample_prompts, + **prompt_token_data + }, + "generation": { + "samples": sample_outputs, + **output_token_data + }, + "server": { + "target": report.args.get('target', 'N/A') + } + } + +def generate_benchmark_json(bm: TextGenerationBenchmark) -> Dict[str, Any]: + ttft_dist_ms = Distribution(data=[val * 1000 for val in bm.ttft_distribution.data]) + ttft_data = generate_metric_report(ttft_dist_ms, 'ttft') + tpot_dist_ms = Distribution(data=[val * 1000 for val in bm.itl_distribution.data]) + tpot_data = generate_metric_report(tpot_dist_ms, 'tpot') + throughput_dist_ms = Distribution(data=[val * 1000 for val in bm.output_token_throughput_distribution.data]) + throughput_data = generate_metric_report(throughput_dist_ms, 'throughput') + latency_dist_ms = Distribution(data=[val * 1000 for val in bm.request_latency_distribution.data]) + time_per_request_data = generate_metric_report(latency_dist_ms, 'timePerRequest') + return { + "requestsPerSecond": bm.completed_request_rate, + **ttft_data, + **tpot_data, + **throughput_data, + **time_per_request_data, + } + +def generate_benchmarks_json(benchmarks: List[TextGenerationBenchmark]): + benchmark_report_json = [] + for benchmark in benchmarks: + benchmarks_report = generate_benchmark_json(benchmark) + benchmark_report_json.append(benchmarks_report) + return benchmark_report_json + +def generate_ui_api_data(report: TextGenerationBenchmarkReport): + run_info_json = generate_run_info(report) + workload_details_json = generate_workload_details(report) + 
benchmarks_json = generate_benchmarks_json(report.benchmarks) + os.makedirs("ben_test", exist_ok=True) + # generate json files based off of api specs, https://codepen.io/dalthecow/pen/bNGVQbq, for consumption by UI + with open("ben_test/run_info.json", "w") as f: + json.dump(run_info_json, f, indent=2) + with open("ben_test/workload_details.json", "w") as f: + json.dump(workload_details_json, f, indent=2) + with open("ben_test/benchmarks.json", "w") as f: + json.dump(benchmarks_json, f, indent=2) + + print("Reports saved to run_info.json, workload_details.json, benchmarks.json") \ No newline at end of file From f870c2000bfd5ddaa6624e240e3631071f4c2805 Mon Sep 17 00:00:00 2001 From: dalthecow Date: Tue, 4 Mar 2025 21:17:34 -0500 Subject: [PATCH 2/9] add in request over time calculation --- src/guidellm/utils/generate_ui_data.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index e9415cc7..be298619 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -72,6 +72,18 @@ def generate_run_info(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: "timestamp": timestamp } +def generate_request_over_time_data(benchmarks: List[TextGenerationBenchmark]) -> List[Dict[str, Any]]: + request_over_time_results = [] + for benchmark in benchmarks: + # compare benchmark start time to text generation result end time + all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] + request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) + request_distribution = Distribution(data=request_over_time_values) + result = generate_metric_report(request_distribution, "requestsOverTime") + request_over_time_results.append(result["requestsOverTime"]) + return request_over_time_results + + def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: all_prompt_token_data = [data for benchmark in report.benchmarks for data in benchmark.prompt_token_distribution.data] all_prompt_token_distribution = Distribution(data=all_prompt_token_data) @@ -84,6 +96,9 @@ def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str output_token_data = generate_metric_report(all_output_token_distribution, "tokenDistributions") output_token_samples = [result.output for benchmark in report.benchmarks for result in benchmark.results] sample_outputs = random.sample(output_token_samples, min(5, len(output_token_samples))) + + request_over_time_results = generate_request_over_time_data(report.benchmarks) + return { "prompts": { "samples": sample_prompts, @@ -93,6 +108,7 @@ def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str "samples": sample_outputs, **output_token_data }, + "requestsOverTime": request_over_time_results, "server": { "target": report.args.get('target', 'N/A') } From d1bbc0c1b2b8f7a2a0183ffb25e27a83ae18aa10 Mon Sep 17 00:00:00 2001 From: dalthecow Date: Fri, 7 Mar 2025 15:12:19 -0500 Subject: [PATCH 3/9] update data generation to better handle sample strings, other small fixes --- src/guidellm/utils/generate_ui_data.py | 35 ++++++++++++++++++-------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index be298619..97ad8118 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -68,7 
+68,9 @@ def generate_run_info(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: "size": 0 }, "task": "N/A", - "dataset": "N/A", + "dataset": { + "name": "N/A" + }, "timestamp": timestamp } @@ -93,10 +95,13 @@ def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str prompt_token_data = generate_metric_report(all_prompt_token_distribution, "tokenDistributions") prompt_token_samples = [result.prompt for benchmark in report.benchmarks for result in benchmark.results] sample_prompts = random.sample(prompt_token_samples, min(5, len(prompt_token_samples))) + sample_prompts = list(map(lambda prompt: prompt.replace("\n", " ").replace("\"", "'"), sample_prompts)) output_token_data = generate_metric_report(all_output_token_distribution, "tokenDistributions") output_token_samples = [result.output for benchmark in report.benchmarks for result in benchmark.results] sample_outputs = random.sample(output_token_samples, min(5, len(output_token_samples))) + sample_outputs = list(map(lambda output: output.replace("\n", " ").replace("\"", "'"), sample_outputs)) + request_over_time_results = generate_request_over_time_data(report.benchmarks) return { @@ -104,7 +109,7 @@ def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str "samples": sample_prompts, **prompt_token_data }, - "generation": { + "generations": { "samples": sample_outputs, **output_token_data }, @@ -138,17 +143,25 @@ def generate_benchmarks_json(benchmarks: List[TextGenerationBenchmark]): benchmark_report_json.append(benchmarks_report) return benchmark_report_json +def generate_js_variable(variable_name: str, data: dict) -> str: + json_data = json.dumps(data, indent=2) + return f'`window.{variable_name} = {json_data};`' # Wrap in quotes + def generate_ui_api_data(report: TextGenerationBenchmarkReport): - run_info_json = generate_run_info(report) - workload_details_json = generate_workload_details(report) - benchmarks_json = generate_benchmarks_json(report.benchmarks) + run_info_data = generate_run_info(report) + workload_details_data = generate_workload_details(report) + benchmarks_data = generate_benchmarks_json(report.benchmarks) + run_info_script = generate_js_variable("run_info", run_info_data) + workload_details_script = generate_js_variable("workload_details", workload_details_data) + benchmarks_script = generate_js_variable("benchmarks", benchmarks_data) + os.makedirs("ben_test", exist_ok=True) # generate json files based off of api specs, https://codepen.io/dalthecow/pen/bNGVQbq, for consumption by UI - with open("ben_test/run_info.json", "w") as f: - json.dump(run_info_json, f, indent=2) - with open("ben_test/workload_details.json", "w") as f: - json.dump(workload_details_json, f, indent=2) - with open("ben_test/benchmarks.json", "w") as f: - json.dump(benchmarks_json, f, indent=2) + with open("ben_test/run_info.js", "w") as f: + f.write(run_info_script) + with open("ben_test/workload_details.js", "w") as f: + f.write(workload_details_script) + with open("ben_test/benchmarks.js", "w") as f: + f.write(benchmarks_script) print("Reports saved to run_info.json, workload_details.json, benchmarks.json") \ No newline at end of file From d59cadafee7c73ce2fbca5829f599364ae4ed3bb Mon Sep 17 00:00:00 2001 From: dalthecow Date: Tue, 11 Mar 2025 13:46:15 -0400 Subject: [PATCH 4/9] hack changes together to get values for request over time data, wip --- src/guidellm/utils/generate_ui_data.py | 74 +++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git 
a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index 97ad8118..8b1c7f67 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -1,6 +1,7 @@ import os import json import random +import math from typing import Any, Dict, List from guidellm.core.distribution import Distribution from guidellm.core import TextGenerationBenchmarkReport, TextGenerationBenchmark @@ -74,7 +75,12 @@ def generate_run_info(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: "timestamp": timestamp } +def linearly_interpolate_value(target_input, lower_input, lower_output, upperInput, upper_output): + fraction = (target_input - lower_input) / (upperInput - lower_input) + return lower_output + fraction * (upper_output - lower_output) + def generate_request_over_time_data(benchmarks: List[TextGenerationBenchmark]) -> List[Dict[str, Any]]: + request_over_time_results = [] for benchmark in benchmarks: # compare benchmark start time to text generation result end time @@ -82,8 +88,59 @@ def generate_request_over_time_data(benchmarks: List[TextGenerationBenchmark]) - request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) request_distribution = Distribution(data=request_over_time_values) result = generate_metric_report(request_distribution, "requestsOverTime") - request_over_time_results.append(result["requestsOverTime"]) - return request_over_time_results + result["requestsPerSecond"] = benchmark.completed_request_rate + request_over_time_results.append(result) + + if len(benchmarks) == 1: + return request_over_time_results + + request_over_time_raw = [] + sorted_bm = sorted(benchmarks, key=lambda bm: bm.completed_request_rate) + for benchmark in sorted_bm: + # compare benchmark start time to text generation result end time + all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] + request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) + request_at_rps = { "rps": benchmark.completed_request_rate, "requests_over_time": request_over_time_values } + request_over_time_raw.append(request_at_rps) + + rps_values = [request_obj["rps"] for request_obj in request_over_time_raw] + rps_range = list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values)))) + interpolated_request_values = [] + lower_rps_index = 0 + for rps in rps_range: + if rps > rps_values[lower_rps_index + 1]: lower_rps_index += 1 + if rps == rps_values[lower_rps_index]: + interpolated_request_values.append({ + "requests_per_second": rps, + "requests_over_time": request_over_time_raw[lower_rps_index]["requests_over_time"][:] + }) + lower_rps_index += 1 + elif rps < rps_values[lower_rps_index + 1]: + interpolated_requests_at_new_rps = [] + for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): + lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] + upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] + new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) + interpolated_requests_at_new_rps.append(new_value) + interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) + elif rps > rps_values[lower_rps_index + 1]: + while rps > rps_values[lower_rps_index + 1]: + lower_rps_index += 1 + interpolated_requests_at_new_rps = 
[] + for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): + lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] + upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] + new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) + interpolated_requests_at_new_rps.append(new_value) + interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) + interpolated_request_over_time_results = [] + for request_value in interpolated_request_values: + request_distribution = Distribution(data=request_value["requests_over_time"]) + result = generate_metric_report(request_distribution, "requestsOverTime") + result["requestsPerSecond"] = request_value["requests_per_second"] + interpolated_request_over_time_results.append(result) + + return interpolated_request_over_time_results def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: @@ -93,13 +150,18 @@ def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str all_output_token_distribution = Distribution(data=all_output_token_data) prompt_token_data = generate_metric_report(all_prompt_token_distribution, "tokenDistributions") - prompt_token_samples = [result.prompt for benchmark in report.benchmarks for result in benchmark.results] - sample_prompts = random.sample(prompt_token_samples, min(5, len(prompt_token_samples))) - sample_prompts = list(map(lambda prompt: prompt.replace("\n", " ").replace("\"", "'"), sample_prompts)) output_token_data = generate_metric_report(all_output_token_distribution, "tokenDistributions") + + prompt_token_samples = [result.prompt for benchmark in report.benchmarks for result in benchmark.results] output_token_samples = [result.output for benchmark in report.benchmarks for result in benchmark.results] - sample_outputs = random.sample(output_token_samples, min(5, len(output_token_samples))) + num_samples = min(5, len(prompt_token_samples), len(output_token_samples)) + sample_indices = random.sample(range(len(prompt_token_samples)), num_samples) + + sample_prompts = [prompt_token_samples[i] for i in sample_indices] + sample_prompts = list(map(lambda prompt: prompt.replace("\n", " ").replace("\"", "'"), sample_prompts)) + + sample_outputs = [output_token_samples[i] for i in sample_indices] sample_outputs = list(map(lambda output: output.replace("\n", " ").replace("\"", "'"), sample_outputs)) request_over_time_results = generate_request_over_time_data(report.benchmarks) From 76fc2b42b7b65a1c6cfa3714d0d01ff1044d794a Mon Sep 17 00:00:00 2001 From: dalthecow Date: Wed, 19 Mar 2025 16:14:28 -0400 Subject: [PATCH 5/9] add interpolation of benchmark metrics by rps, remove interpolation of request over time data and use raw, refactor and test interpolation functionality --- src/guidellm/utils/__init__.py | 7 +- src/guidellm/utils/generate_ui_data.py | 236 ++++++++++++++++--------- src/guidellm/utils/interpolation.py | 89 ++++++++++ tests/unit/utils/test_interpolation.py | 26 +++ 4 files changed, 277 insertions(+), 81 deletions(-) create mode 100644 src/guidellm/utils/interpolation.py create mode 100644 tests/unit/utils/test_interpolation.py diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 96c02049..81cf580f 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,5 +1,6 @@ -from .injector import 
create_report, inject_data from .generate_ui_data import generate_ui_api_data +from .interpolation import linear_interpolate, interpolate_measurements, interpolate_data_points, stretch_list +from .injector import create_report, inject_data from .progress import BenchmarkReportProgress from .text import ( clean_text, @@ -27,9 +28,12 @@ "filter_text", "generate_ui_api_data", "inject_data", + "interpolate_data_points", + "interpolate_measurements", "is_path", "is_path_like", "is_url", + "linear_interpolate", "load_text", "load_text_lines", "load_transformers_dataset", @@ -39,4 +43,5 @@ "resolve_transformers_dataset_split", "split_lines_by_punctuation", "split_text", + "stretch_list", ] diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index 8b1c7f67..a450fd94 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List from guidellm.core.distribution import Distribution from guidellm.core import TextGenerationBenchmarkReport, TextGenerationBenchmark +from guidellm.utils.interpolation import interpolate_data_points def generate_metric_report(dist: Distribution, metric_label: str, n_buckets: int = 18): total = dist.__len__() @@ -61,8 +62,8 @@ def generate_metric_report(dist: Distribution, metric_label: str, n_buckets: int } } -def generate_run_info(report: TextGenerationBenchmarkReport) -> Dict[str, Any]: - timestamp = max(map(lambda bm: bm.end_time, report.benchmarks)) +def generate_run_info(report: TextGenerationBenchmarkReport, benchmarks: List[TextGenerationBenchmark]) -> Dict[str, Any]: + timestamp = max(bm.start_time for bm in benchmarks if bm.start_time is not None) return { "model": { "name": report.args.get('model', 'N/A'), @@ -80,91 +81,109 @@ def linearly_interpolate_value(target_input, lower_input, lower_output, upperInp return lower_output + fraction * (upper_output - lower_output) def generate_request_over_time_data(benchmarks: List[TextGenerationBenchmark]) -> List[Dict[str, Any]]: + filtered_benchmarks = filter(lambda bm: bm.start_time is not None, benchmarks) + sorted_benchmarks = list(sorted(filtered_benchmarks, key=lambda bm: bm.start_time)) + min_start_time = sorted_benchmarks[0].start_time - request_over_time_results = [] - for benchmark in benchmarks: - # compare benchmark start time to text generation result end time - all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] - request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) - request_distribution = Distribution(data=request_over_time_values) - result = generate_metric_report(request_distribution, "requestsOverTime") - result["requestsPerSecond"] = benchmark.completed_request_rate - request_over_time_results.append(result) + all_request_times = [ + result.start_time - min_start_time + for benchmark in sorted_benchmarks + for result in benchmark.results + if result.start_time is not None + ] - if len(benchmarks) == 1: - return request_over_time_results + request_distribution = Distribution(data=all_request_times) + final_result = generate_metric_report(request_distribution, "requestsOverTime") + return { "numBenchmarks": len(sorted_benchmarks), **final_result } + +# def generate_request_over_time_data_per_benchmark(benchmarks: List[TextGenerationBenchmark]) -> List[Dict[str, Any]]: + +# request_over_time_results = [] +# for benchmark in benchmarks: +# # compare benchmark start time to text 
generation result end time +# all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] +# request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) +# request_distribution = Distribution(data=request_over_time_values) +# result = generate_metric_report(request_distribution, "requestsOverTime") +# result["requestsPerSecond"] = benchmark.completed_request_rate +# request_over_time_results.append(result) + +# if len(benchmarks) == 1: +# return request_over_time_results - request_over_time_raw = [] - sorted_bm = sorted(benchmarks, key=lambda bm: bm.completed_request_rate) - for benchmark in sorted_bm: - # compare benchmark start time to text generation result end time - all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] - request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) - request_at_rps = { "rps": benchmark.completed_request_rate, "requests_over_time": request_over_time_values } - request_over_time_raw.append(request_at_rps) - - rps_values = [request_obj["rps"] for request_obj in request_over_time_raw] - rps_range = list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values)))) - interpolated_request_values = [] - lower_rps_index = 0 - for rps in rps_range: - if rps > rps_values[lower_rps_index + 1]: lower_rps_index += 1 - if rps == rps_values[lower_rps_index]: - interpolated_request_values.append({ - "requests_per_second": rps, - "requests_over_time": request_over_time_raw[lower_rps_index]["requests_over_time"][:] - }) - lower_rps_index += 1 - elif rps < rps_values[lower_rps_index + 1]: - interpolated_requests_at_new_rps = [] - for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): - lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] - upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] - new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) - interpolated_requests_at_new_rps.append(new_value) - interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) - elif rps > rps_values[lower_rps_index + 1]: - while rps > rps_values[lower_rps_index + 1]: - lower_rps_index += 1 - interpolated_requests_at_new_rps = [] - for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): - lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] - upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] - new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) - interpolated_requests_at_new_rps.append(new_value) - interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) - interpolated_request_over_time_results = [] - for request_value in interpolated_request_values: - request_distribution = Distribution(data=request_value["requests_over_time"]) - result = generate_metric_report(request_distribution, "requestsOverTime") - result["requestsPerSecond"] = request_value["requests_per_second"] - interpolated_request_over_time_results.append(result) - - return interpolated_request_over_time_results - - -def generate_workload_details(report: TextGenerationBenchmarkReport) -> 
Dict[str, Any]: - all_prompt_token_data = [data for benchmark in report.benchmarks for data in benchmark.prompt_token_distribution.data] +# request_over_time_raw = [] +# sorted_bm = sorted(benchmarks, key=lambda bm: bm.completed_request_rate) +# for benchmark in sorted_bm: +# # compare benchmark start time to text generation result end time +# all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] +# request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) +# request_at_rps = { "rps": benchmark.completed_request_rate, "requests_over_time": request_over_time_values } +# request_over_time_raw.append(request_at_rps) + +# rps_values = [request_obj["rps"] for request_obj in request_over_time_raw] +# rps_range = list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values)))) +# interpolated_request_values = [] +# lower_rps_index = 0 +# for rps in rps_range: +# if rps > rps_values[lower_rps_index + 1]: lower_rps_index += 1 +# if rps == rps_values[lower_rps_index]: +# interpolated_request_values.append({ +# "requests_per_second": rps, +# "requests_over_time": request_over_time_raw[lower_rps_index]["requests_over_time"][:] +# }) +# lower_rps_index += 1 +# elif rps < rps_values[lower_rps_index + 1]: +# interpolated_requests_at_new_rps = [] +# for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): +# lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] +# upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] +# new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) +# interpolated_requests_at_new_rps.append(new_value) +# interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) +# elif rps > rps_values[lower_rps_index + 1]: +# while rps > rps_values[lower_rps_index + 1]: +# lower_rps_index += 1 +# interpolated_requests_at_new_rps = [] +# for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): +# lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] +# upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] +# new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) +# interpolated_requests_at_new_rps.append(new_value) +# interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) +# interpolated_request_over_time_results = [] +# for request_value in interpolated_request_values: +# request_distribution = Distribution(data=request_value["requests_over_time"]) +# result = generate_metric_report(request_distribution, "requestsOverTime") +# result["requestsPerSecond"] = request_value["requests_per_second"] +# interpolated_request_over_time_results.append(result) +# return { "rawData": request_over_time_results, "interpolatedData": interpolated_request_over_time_results } + + +def generate_workload_details(report: TextGenerationBenchmarkReport, benchmarks: List[TextGenerationBenchmark]) -> Dict[str, Any]: + all_prompt_token_data = [data for benchmark in benchmarks for data in benchmark.prompt_token_distribution.data] all_prompt_token_distribution = Distribution(data=all_prompt_token_data) - all_output_token_data = [data for benchmark in report.benchmarks for data in 
benchmark.output_token_distribution.data] + all_output_token_data = [data for benchmark in benchmarks for data in benchmark.output_token_distribution.data] all_output_token_distribution = Distribution(data=all_output_token_data) prompt_token_data = generate_metric_report(all_prompt_token_distribution, "tokenDistributions") output_token_data = generate_metric_report(all_output_token_distribution, "tokenDistributions") - prompt_token_samples = [result.prompt for benchmark in report.benchmarks for result in benchmark.results] - output_token_samples = [result.output for benchmark in report.benchmarks for result in benchmark.results] + prompt_token_samples = [result.prompt for benchmark in benchmarks for result in benchmark.results] + output_token_samples = [result.output for benchmark in benchmarks for result in benchmark.results] num_samples = min(5, len(prompt_token_samples), len(output_token_samples)) sample_indices = random.sample(range(len(prompt_token_samples)), num_samples) sample_prompts = [prompt_token_samples[i] for i in sample_indices] + """ + Need a wholistic approach to parsing out characters in the prompt that don't covert well into the format we need + """ sample_prompts = list(map(lambda prompt: prompt.replace("\n", " ").replace("\"", "'"), sample_prompts)) sample_outputs = [output_token_samples[i] for i in sample_indices] sample_outputs = list(map(lambda output: output.replace("\n", " ").replace("\"", "'"), sample_outputs)) - request_over_time_results = generate_request_over_time_data(report.benchmarks) + request_over_time_results = generate_request_over_time_data(benchmarks) return { "prompts": { @@ -184,35 +203,92 @@ def generate_workload_details(report: TextGenerationBenchmarkReport) -> Dict[str def generate_benchmark_json(bm: TextGenerationBenchmark) -> Dict[str, Any]: ttft_dist_ms = Distribution(data=[val * 1000 for val in bm.ttft_distribution.data]) ttft_data = generate_metric_report(ttft_dist_ms, 'ttft') - tpot_dist_ms = Distribution(data=[val * 1000 for val in bm.itl_distribution.data]) - tpot_data = generate_metric_report(tpot_dist_ms, 'tpot') - throughput_dist_ms = Distribution(data=[val * 1000 for val in bm.output_token_throughput_distribution.data]) + itl_dist_ms = Distribution(data=[val * 1000 for val in bm.itl_distribution.data]) + itl_data = generate_metric_report(itl_dist_ms, 'tpot') + throughput_dist_ms = Distribution(data=bm.output_token_throughput_distribution.data) throughput_data = generate_metric_report(throughput_dist_ms, 'throughput') latency_dist_ms = Distribution(data=[val * 1000 for val in bm.request_latency_distribution.data]) - time_per_request_data = generate_metric_report(latency_dist_ms, 'timePerRequest') + latency__data = generate_metric_report(latency_dist_ms, 'timePerRequest') return { "requestsPerSecond": bm.completed_request_rate, + **itl_data, **ttft_data, - **tpot_data, **throughput_data, - **time_per_request_data, + **latency__data, } +def generate_interpolated_benchmarks(benchmarks: List[TextGenerationBenchmark]): + """ + Should we only use constant rate benchmarks here since synchronous and throughput runs might not be appropriate to lump in for interoplation across all rps? 
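Editor's aside, not part of the patch series: the concern raised in this docstring about the usable RPS range can be checked directly. The interpolation targets are the integers from ceil(min rps) up to, but not including, ceil(max rps), so closely spaced request rates may yield no targets at all. In the sketch below, interpolation_targets is an illustrative name for what the patch computes inline as rps_range; the sample rates are the ones quoted in this docstring.

# Editor's illustrative sketch, not part of the patch series.
import math
from typing import List


def interpolation_targets(rps_values: List[float]) -> List[int]:
    # Mirrors rps_range in generate_interpolated_benchmarks:
    # integers from ceil(min) up to (but not including) ceil(max).
    return list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values))))


print(interpolation_targets([1.1, 1.3, 1.5, 2.1, 2.5]))  # [2]  -> can interpolate at 2 rps
print(interpolation_targets([1.1, 1.4, 1.6]))            # []   -> nothing to interpolate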
+ + Other edge-case, what if rps doesn't span more than 1 whole rps even with multiple benchmarks + ex: 1.1, 1.3, 1.5, 2.1, 2.5, can interpolate at 2rps + or worse, 1.1, 1.4, 1.6, can't interpolate + """ + if len(benchmarks) == 1: + return [] + + sorted_benchmarks = sorted(benchmarks[:], key=lambda bm: bm.completed_request_rate) + rps_values = [bm.completed_request_rate for bm in sorted_benchmarks] + rps_range = list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values)))) + + ttft_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.ttft_distribution.data), sorted_benchmarks)) + interpolated_ttft_data_by_rps = interpolate_data_points(ttft_data_by_rps, rps_range) + + itl_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.itl_distribution.data), sorted_benchmarks)) + interpolated_itl_data_by_rps = interpolate_data_points(itl_data_by_rps, rps_range) + + throughput_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.output_token_throughput_distribution.data), sorted_benchmarks)) + interpolated_throughput_data_by_rps = interpolate_data_points(throughput_data_by_rps, rps_range) + + latency_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.request_latency_distribution.data), sorted_benchmarks)) + interpolated_latency_data_by_rps = interpolate_data_points(latency_data_by_rps, rps_range) + + benchmark_json = [] + for i in range(len(interpolated_ttft_data_by_rps)): + rps, interpolated_ttft_data = interpolated_ttft_data_by_rps[i] + ttft_dist_ms = Distribution(data=[val * 1000 for val in interpolated_ttft_data]) + final_ttft_data = generate_metric_report(ttft_dist_ms, 'ttft') + + _, interpolated_itl_data = interpolated_itl_data_by_rps[i] + itl_dist_ms = Distribution(data=[val * 1000 for val in interpolated_itl_data]) + final_itl_data = generate_metric_report(itl_dist_ms, 'tpot') + + _, interpolated_throughput_data = interpolated_throughput_data_by_rps[i] + throughput_dist_ms = Distribution(data=interpolated_throughput_data) + final_throughput_data = generate_metric_report(throughput_dist_ms, 'throughput') + + _, interpolated_latency_data = interpolated_latency_data_by_rps[i] + latency_dist_ms = Distribution(data=[val * 1000 for val in interpolated_latency_data]) + final_latency_data = generate_metric_report(latency_dist_ms, 'timePerRequest') + + benchmark_json.append({ + "requestsPerSecond": rps, + **final_itl_data, + **final_ttft_data, + **final_throughput_data, + **final_latency_data, + }) + return benchmark_json + def generate_benchmarks_json(benchmarks: List[TextGenerationBenchmark]): - benchmark_report_json = [] + raw_benchmark_json = [] for benchmark in benchmarks: benchmarks_report = generate_benchmark_json(benchmark) - benchmark_report_json.append(benchmarks_report) - return benchmark_report_json + raw_benchmark_json.append(benchmarks_report) + interpolated_benchmark_json = generate_interpolated_benchmarks(benchmarks) + + return { "raw": raw_benchmark_json, "interpolated_by_rps": interpolated_benchmark_json } def generate_js_variable(variable_name: str, data: dict) -> str: json_data = json.dumps(data, indent=2) return f'`window.{variable_name} = {json_data};`' # Wrap in quotes def generate_ui_api_data(report: TextGenerationBenchmarkReport): - run_info_data = generate_run_info(report) - workload_details_data = generate_workload_details(report) - benchmarks_data = generate_benchmarks_json(report.benchmarks) + filtered_benchmarks = list(filter(lambda bm: bm.completed_request_rate > 0, report.benchmarks)) + run_info_data = 
generate_run_info(report, filtered_benchmarks) + workload_details_data = generate_workload_details(report, filtered_benchmarks) + benchmarks_data = generate_benchmarks_json(filtered_benchmarks) run_info_script = generate_js_variable("run_info", run_info_data) workload_details_script = generate_js_variable("workload_details", workload_details_data) benchmarks_script = generate_js_variable("benchmarks", benchmarks_data) diff --git a/src/guidellm/utils/interpolation.py b/src/guidellm/utils/interpolation.py new file mode 100644 index 00000000..ef19d8e4 --- /dev/null +++ b/src/guidellm/utils/interpolation.py @@ -0,0 +1,89 @@ +from typing import List, Tuple +import numpy as np + +def linear_interpolate(target: float, lower: Tuple[float, float], upper: Tuple[float, float]) -> float: + """ + Linearly interpolates a value at 'target' given two points. + If the target equals one of the bounds, the corresponding value is returned. + """ + lower_ref, lower_measurement = lower + upper_ref, upper_measurement = upper + + if upper_ref == lower_ref: + return lower_measurement + if target <= lower_ref: + return lower_measurement + if target >= upper_ref: + return upper_measurement + + t = (target - lower_ref) / (upper_ref - lower_ref) + return lower_measurement + t * (upper_measurement - lower_measurement) + +def stretch_list(arr: List[float], target_length: int): + if len(arr) == target_length: + return np.array(arr) + + original_x = np.linspace(0, 1, len(arr)) + target_x = np.linspace(0, 1, target_length) + stretched_arr = list(np.interp(target_x, original_x, arr)) + return stretched_arr + +def interpolate_measurements(target: float, lower_ref_measurements_pair: Tuple[float, List[float]], upper_ref_measurements_pair: Tuple[float, List[float]]) -> List[float]: + """ + Interpolates each corresponding measurement value between lower and upper benchmarks. + Assumes that lower_measurements and upper_measurements have the same length. + """ + lower_ref, lower_measurements = lower_ref_measurements_pair + upper_ref, upper_measurements = upper_ref_measurements_pair + + if len(lower_measurements) < len(upper_measurements): + lower_measurements = stretch_list(lower_measurements, len(upper_measurements)) + if len(lower_measurements) > len(upper_measurements): + upper_measurements = stretch_list(upper_measurements, len(lower_measurements)) + + return [ + linear_interpolate(target, (lower_ref, lower_measurements[i]), (upper_ref, upper_measurements[i])) + for i in range(len(lower_measurements)) + ] + +def interpolate_data_points(data_points: List[Tuple[float, List[float]]], + target_ref: List[float]) -> List[Tuple[float, List[float]]]: + """ + Given sorted data_points as tuples of (scalar, measurements) and a list of target scalar values, + interpolate the measurements for each target. + + The data_points must be sorted by the scalar value in ascending order. + Only target scalar values that fall within the min and max of the data_points are considered. + """ + if not data_points: + return [] + + lower_bound = data_points[0][0] + upper_bound = data_points[-1][0] + # Filter target_ref to only include values within the provided range. + valid_targets = [t for t in target_ref if lower_bound <= t <= upper_bound] + + interpolated_results = [] + # Pointer to the current lower data point index. + lower_idx = 0 + + for target in sorted(valid_targets): + # Advance the lower_idx until we find the correct interval. 
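Editor's aside, not part of the patch series: the resampling-plus-pairwise-interpolation above is the core of the new interpolation.py. The standalone sketch below restates stretch_list and linear_interpolate so their behavior can be checked in isolation; the names and logic are copied from the patch, and only the small demo at the bottom is new, with expected values taken from tests/unit/utils/test_interpolation.py added later in this patch.

# Editor's illustrative sketch, not part of the patch series.
from typing import List, Tuple

import numpy as np


def stretch_list(arr: List[float], target_length: int) -> List[float]:
    # Resample arr onto target_length evenly spaced points via linear interpolation.
    if len(arr) == target_length:
        return list(arr)
    original_x = np.linspace(0, 1, len(arr))
    target_x = np.linspace(0, 1, target_length)
    return list(np.interp(target_x, original_x, arr))


def linear_interpolate(target: float, lower: Tuple[float, float], upper: Tuple[float, float]) -> float:
    # Clamp outside [lower_ref, upper_ref]; interpolate linearly inside it.
    lower_ref, lower_val = lower
    upper_ref, upper_val = upper
    if upper_ref == lower_ref or target <= lower_ref:
        return lower_val
    if target >= upper_ref:
        return upper_val
    fraction = (target - lower_ref) / (upper_ref - lower_ref)
    return lower_val + fraction * (upper_val - lower_val)


if __name__ == "__main__":
    print(stretch_list([1, 3, 5], 5))             # [1.0, 2.0, 3.0, 4.0, 5.0]
    print(linear_interpolate(2, (1, 4), (3, 6)))  # 5.0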
+ while (lower_idx < len(data_points) - 1 and target > data_points[lower_idx + 1][0]): + lower_idx += 1 + + # If the target exactly matches a known scalar value, use its measurements. + if target == data_points[lower_idx][0]: + interpolated_results.append((target, data_points[lower_idx][1][:])) + # Otherwise, if target lies between two data points, interpolate. + elif lower_idx < len(data_points) - 1: + lower_ref, lower_measurements = data_points[lower_idx] + upper_ref, upper_measurements = data_points[lower_idx + 1] + interpolated = interpolate_measurements(target, (lower_ref, lower_measurements), + (upper_ref, upper_measurements)) + interpolated_results.append((target, interpolated)) + else: + # If for some reason target is above the highest known data point, ignore it. + continue + + return interpolated_results \ No newline at end of file diff --git a/tests/unit/utils/test_interpolation.py b/tests/unit/utils/test_interpolation.py new file mode 100644 index 00000000..a79504b5 --- /dev/null +++ b/tests/unit/utils/test_interpolation.py @@ -0,0 +1,26 @@ +import pytest +from guidellm.utils.interpolation import ( + linear_interpolate, + interpolate_measurements, + interpolate_data_points, + stretch_list +) + +@pytest.mark.smoke() +def test_linear_interpolate(): + assert linear_interpolate(2, (1, 4), (3, 6)) == 5 + + +@pytest.mark.smoke() +def test_stretch_list(): + assert stretch_list([1,3,5], 5) == [1,2,3,4,5] + +@pytest.mark.smoke() +def test_interpolate_measurements(): + assert interpolate_measurements(2, (1, [1,2,3,4,5]), (3, [2,3,4,5,6])) == [1.5,2.5,3.5,4.5,5.5] + assert interpolate_measurements(2, (1, [1,2,3,4,5]), (3, [5,4,3,2,1])) == [3,3,3,3,3] + + +@pytest.mark.smoke() +def test_interpolate_data_point(): + assert interpolate_data_points([(1, [1,2,3,4,5]), (3, [2,3,4,5,6]), (9, [5,6,7,8,9])], [1,2,3,4,5,6,7,8,9]) == [(1, [1,2,3,4,5]), (2, [1.5,2.5,3.5,4.5,5.5]), (3, [2,3,4,5,6]), (4, [2.5, 3.5, 4.5, 5.5, 6.5]), (5, [3, 4, 5, 6, 7]), (6, [3.5, 4.5, 5.5, 6.5, 7.5]), (7, [4, 5, 6, 7, 8]), (8, [4.5, 5.5, 6.5, 7.5, 8.5]), (9, [5, 6, 7, 8, 9])] \ No newline at end of file From 18335186431ab7c03690dd475a4b993bb6a9578c Mon Sep 17 00:00:00 2001 From: dalthecow Date: Wed, 19 Mar 2025 16:15:20 -0400 Subject: [PATCH 6/9] remove commented code --- src/guidellm/utils/generate_ui_data.py | 62 -------------------------- 1 file changed, 62 deletions(-) diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index a450fd94..1ccaf3de 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -96,68 +96,6 @@ def generate_request_over_time_data(benchmarks: List[TextGenerationBenchmark]) - final_result = generate_metric_report(request_distribution, "requestsOverTime") return { "numBenchmarks": len(sorted_benchmarks), **final_result } -# def generate_request_over_time_data_per_benchmark(benchmarks: List[TextGenerationBenchmark]) -> List[Dict[str, Any]]: - -# request_over_time_results = [] -# for benchmark in benchmarks: -# # compare benchmark start time to text generation result end time -# all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] -# request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) -# request_distribution = Distribution(data=request_over_time_values) -# result = generate_metric_report(request_distribution, "requestsOverTime") -# result["requestsPerSecond"] = benchmark.completed_request_rate -# 
request_over_time_results.append(result) - -# if len(benchmarks) == 1: -# return request_over_time_results - -# request_over_time_raw = [] -# sorted_bm = sorted(benchmarks, key=lambda bm: bm.completed_request_rate) -# for benchmark in sorted_bm: -# # compare benchmark start time to text generation result end time -# all_result_end_times = [result.end_time for result in benchmark.results if result.end_time is not None] -# request_over_time_values = list(map(lambda time: time - benchmark.start_time, all_result_end_times)) -# request_at_rps = { "rps": benchmark.completed_request_rate, "requests_over_time": request_over_time_values } -# request_over_time_raw.append(request_at_rps) - -# rps_values = [request_obj["rps"] for request_obj in request_over_time_raw] -# rps_range = list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values)))) -# interpolated_request_values = [] -# lower_rps_index = 0 -# for rps in rps_range: -# if rps > rps_values[lower_rps_index + 1]: lower_rps_index += 1 -# if rps == rps_values[lower_rps_index]: -# interpolated_request_values.append({ -# "requests_per_second": rps, -# "requests_over_time": request_over_time_raw[lower_rps_index]["requests_over_time"][:] -# }) -# lower_rps_index += 1 -# elif rps < rps_values[lower_rps_index + 1]: -# interpolated_requests_at_new_rps = [] -# for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): -# lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] -# upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] -# new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) -# interpolated_requests_at_new_rps.append(new_value) -# interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) -# elif rps > rps_values[lower_rps_index + 1]: -# while rps > rps_values[lower_rps_index + 1]: -# lower_rps_index += 1 -# interpolated_requests_at_new_rps = [] -# for i in range(len(request_over_time_raw[lower_rps_index]["requests_over_time"])): -# lower_request = request_over_time_raw[lower_rps_index]["requests_over_time"][i] -# upper_request = request_over_time_raw[lower_rps_index + 1]["requests_over_time"][i] -# new_value = linearly_interpolate_value(rps, rps_values[lower_rps_index], lower_request, rps_values[lower_rps_index + 1], upper_request) -# interpolated_requests_at_new_rps.append(new_value) -# interpolated_request_values.append({ "requests_per_second": rps, "requests_over_time": interpolated_requests_at_new_rps }) -# interpolated_request_over_time_results = [] -# for request_value in interpolated_request_values: -# request_distribution = Distribution(data=request_value["requests_over_time"]) -# result = generate_metric_report(request_distribution, "requestsOverTime") -# result["requestsPerSecond"] = request_value["requests_per_second"] -# interpolated_request_over_time_results.append(result) -# return { "rawData": request_over_time_results, "interpolatedData": interpolated_request_over_time_results } - def generate_workload_details(report: TextGenerationBenchmarkReport, benchmarks: List[TextGenerationBenchmark]) -> Dict[str, Any]: all_prompt_token_data = [data for benchmark in benchmarks for data in benchmark.prompt_token_distribution.data] From 8c6325577acccd5c9dc0931e0e8a3e3ed48e12fb Mon Sep 17 00:00:00 2001 From: dalthecow Date: Thu, 20 Mar 2025 19:56:50 -0400 Subject: [PATCH 7/9] update json property name from 
snake_case to camelCase --- src/guidellm/utils/generate_ui_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index 1ccaf3de..1c8fd775 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -216,7 +216,7 @@ def generate_benchmarks_json(benchmarks: List[TextGenerationBenchmark]): raw_benchmark_json.append(benchmarks_report) interpolated_benchmark_json = generate_interpolated_benchmarks(benchmarks) - return { "raw": raw_benchmark_json, "interpolated_by_rps": interpolated_benchmark_json } + return { "raw": raw_benchmark_json, "interpolatedByRps": interpolated_benchmark_json } def generate_js_variable(variable_name: str, data: dict) -> str: json_data = json.dumps(data, indent=2) From 9691ef31e48b0b5d0197a52344222c9fdf9fdd9e Mon Sep 17 00:00:00 2001 From: dalthecow Date: Thu, 27 Mar 2025 14:10:03 -0400 Subject: [PATCH 8/9] include mode --- src/guidellm/utils/generate_ui_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index 1c8fd775..507d4910 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -133,6 +133,7 @@ def generate_workload_details(report: TextGenerationBenchmarkReport, benchmarks: **output_token_data }, "requestsOverTime": request_over_time_results, + "rateType": report.args["mode"], "server": { "target": report.args.get('target', 'N/A') } @@ -223,7 +224,7 @@ def generate_js_variable(variable_name: str, data: dict) -> str: return f'`window.{variable_name} = {json_data};`' # Wrap in quotes def generate_ui_api_data(report: TextGenerationBenchmarkReport): - filtered_benchmarks = list(filter(lambda bm: bm.completed_request_rate > 0, report.benchmarks)) + filtered_benchmarks = list(filter(lambda bm: (bm.completed_request_rate > 0) and bm.mode != 'throughput', report.benchmarks)) run_info_data = generate_run_info(report, filtered_benchmarks) workload_details_data = generate_workload_details(report, filtered_benchmarks) benchmarks_data = generate_benchmarks_json(filtered_benchmarks) From 656e6dd13e903eda58a3d1ca4d9bc020f9887f90 Mon Sep 17 00:00:00 2001 From: dalthecow Date: Thu, 3 Apr 2025 18:37:47 -0400 Subject: [PATCH 9/9] remove backen benchmark interoplation, hook up data generation to injector and output html report --- src/guidellm/config.py | 10 ++- src/guidellm/main.py | 4 +- src/guidellm/utils/__init__.py | 4 -- src/guidellm/utils/generate_ui_data.py | 80 ++++------------------- src/guidellm/utils/injector.py | 41 ++++-------- src/guidellm/utils/interpolation.py | 89 -------------------------- tests/unit/utils/test_interpolation.py | 26 -------- 7 files changed, 33 insertions(+), 221 deletions(-) delete mode 100644 src/guidellm/utils/interpolation.py delete mode 100644 tests/unit/utils/test_interpolation.py diff --git a/src/guidellm/config.py b/src/guidellm/config.py index 2d4e102a..52dcaf3e 100644 --- a/src/guidellm/config.py +++ b/src/guidellm/config.py @@ -32,9 +32,9 @@ class Environment(str, Enum): ENV_REPORT_MAPPING = { Environment.PROD: "https://guidellm.neuralmagic.com/local-report/index.html", - Environment.STAGING: "https://staging.guidellm.neuralmagic.com/local-report/index.html", - Environment.DEV: "https://dev.guidellm.neuralmagic.com/local-report/index.html", - Environment.LOCAL: "tests/dummy/report.html", + Environment.STAGING: 
"https://review.neuralmagic.com/guidellm-ui/staging/index.html", + Environment.DEV: "https://review.neuralmagic.com/guidellm-ui/dev/index.html", + Environment.LOCAL: "http://localhost:3000/index.html", } @@ -112,8 +112,6 @@ class ReportGenerationSettings(BaseModel): """ source: str = "" - report_html_match: str = "window.report_data = {};" - report_html_placeholder: str = "{}" class Settings(BaseSettings): @@ -138,7 +136,7 @@ class Settings(BaseSettings): ) # general settings - env: Environment = Environment.PROD + env: Environment = Environment.DEV request_timeout: int = 60 * 5 # 5 minutes request_http2: bool = True max_concurrency: int = 512 diff --git a/src/guidellm/main.py b/src/guidellm/main.py index ad29beeb..5bd79b04 100644 --- a/src/guidellm/main.py +++ b/src/guidellm/main.py @@ -2,6 +2,7 @@ from typing import Any, Literal, Mapping, Optional, Union, get_args import click +from guidellm.utils.injector import create_report from loguru import logger from transformers import AutoTokenizer # type: ignore[import-untyped] @@ -290,7 +291,8 @@ def generate_benchmark_report( ) report = asyncio.run(_run_executor_for_result(executor)) - generate_ui_api_data(report) + js_data = generate_ui_api_data(report) + create_report(js_data, 'guidellm_report') # Save and print report guidance_report = GuidanceReport() diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 81cf580f..8ca923b7 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,5 +1,4 @@ from .generate_ui_data import generate_ui_api_data -from .interpolation import linear_interpolate, interpolate_measurements, interpolate_data_points, stretch_list from .injector import create_report, inject_data from .progress import BenchmarkReportProgress from .text import ( @@ -28,12 +27,9 @@ "filter_text", "generate_ui_api_data", "inject_data", - "interpolate_data_points", - "interpolate_measurements", "is_path", "is_path_like", "is_url", - "linear_interpolate", "load_text", "load_text_lines", "load_transformers_dataset", diff --git a/src/guidellm/utils/generate_ui_data.py b/src/guidellm/utils/generate_ui_data.py index 507d4910..6181c0c3 100644 --- a/src/guidellm/utils/generate_ui_data.py +++ b/src/guidellm/utils/generate_ui_data.py @@ -5,7 +5,6 @@ from typing import Any, Dict, List from guidellm.core.distribution import Distribution from guidellm.core import TextGenerationBenchmarkReport, TextGenerationBenchmark -from guidellm.utils.interpolation import interpolate_data_points def generate_metric_report(dist: Distribution, metric_label: str, n_buckets: int = 18): total = dist.__len__() @@ -76,10 +75,6 @@ def generate_run_info(report: TextGenerationBenchmarkReport, benchmarks: List[Te "timestamp": timestamp } -def linearly_interpolate_value(target_input, lower_input, lower_output, upperInput, upper_output): - fraction = (target_input - lower_input) / (upperInput - lower_input) - return lower_output + fraction * (upper_output - lower_output) - def generate_request_over_time_data(benchmarks: List[TextGenerationBenchmark]) -> List[Dict[str, Any]]: filtered_benchmarks = filter(lambda bm: bm.start_time is not None, benchmarks) sorted_benchmarks = list(sorted(filtered_benchmarks, key=lambda bm: bm.start_time)) @@ -106,7 +101,7 @@ def generate_workload_details(report: TextGenerationBenchmarkReport, benchmarks: prompt_token_data = generate_metric_report(all_prompt_token_distribution, "tokenDistributions") output_token_data = generate_metric_report(all_output_token_distribution, "tokenDistributions") 
- prompt_token_samples = [result.prompt for benchmark in benchmarks for result in benchmark.results] + prompt_token_samples = [result.request.prompt for benchmark in benchmarks for result in benchmark.results] output_token_samples = [result.output for benchmark in benchmarks for result in benchmark.results] num_samples = min(5, len(prompt_token_samples), len(output_token_samples)) @@ -140,9 +135,9 @@ def generate_workload_details(report: TextGenerationBenchmarkReport, benchmarks: } def generate_benchmark_json(bm: TextGenerationBenchmark) -> Dict[str, Any]: - ttft_dist_ms = Distribution(data=[val * 1000 for val in bm.ttft_distribution.data]) + ttft_dist_ms = Distribution(data=bm.ttft_distribution.data) ttft_data = generate_metric_report(ttft_dist_ms, 'ttft') - itl_dist_ms = Distribution(data=[val * 1000 for val in bm.itl_distribution.data]) + itl_dist_ms = Distribution(data=bm.itl_distribution.data) itl_data = generate_metric_report(itl_dist_ms, 'tpot') throughput_dist_ms = Distribution(data=bm.output_token_throughput_distribution.data) throughput_data = generate_metric_report(throughput_dist_ms, 'throughput') @@ -156,72 +151,17 @@ def generate_benchmark_json(bm: TextGenerationBenchmark) -> Dict[str, Any]: **latency__data, } -def generate_interpolated_benchmarks(benchmarks: List[TextGenerationBenchmark]): - """ - Should we only use constant rate benchmarks here since synchronous and throughput runs might not be appropriate to lump in for interoplation across all rps? - - Other edge-case, what if rps doesn't span more than 1 whole rps even with multiple benchmarks - ex: 1.1, 1.3, 1.5, 2.1, 2.5, can interpolate at 2rps - or worse, 1.1, 1.4, 1.6, can't interpolate - """ - if len(benchmarks) == 1: - return [] - - sorted_benchmarks = sorted(benchmarks[:], key=lambda bm: bm.completed_request_rate) - rps_values = [bm.completed_request_rate for bm in sorted_benchmarks] - rps_range = list(range(math.ceil(min(rps_values)), math.ceil(max(rps_values)))) - - ttft_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.ttft_distribution.data), sorted_benchmarks)) - interpolated_ttft_data_by_rps = interpolate_data_points(ttft_data_by_rps, rps_range) - - itl_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.itl_distribution.data), sorted_benchmarks)) - interpolated_itl_data_by_rps = interpolate_data_points(itl_data_by_rps, rps_range) - - throughput_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.output_token_throughput_distribution.data), sorted_benchmarks)) - interpolated_throughput_data_by_rps = interpolate_data_points(throughput_data_by_rps, rps_range) - - latency_data_by_rps = list(map(lambda bm: (bm.completed_request_rate, bm.request_latency_distribution.data), sorted_benchmarks)) - interpolated_latency_data_by_rps = interpolate_data_points(latency_data_by_rps, rps_range) - - benchmark_json = [] - for i in range(len(interpolated_ttft_data_by_rps)): - rps, interpolated_ttft_data = interpolated_ttft_data_by_rps[i] - ttft_dist_ms = Distribution(data=[val * 1000 for val in interpolated_ttft_data]) - final_ttft_data = generate_metric_report(ttft_dist_ms, 'ttft') - - _, interpolated_itl_data = interpolated_itl_data_by_rps[i] - itl_dist_ms = Distribution(data=[val * 1000 for val in interpolated_itl_data]) - final_itl_data = generate_metric_report(itl_dist_ms, 'tpot') - - _, interpolated_throughput_data = interpolated_throughput_data_by_rps[i] - throughput_dist_ms = Distribution(data=interpolated_throughput_data) - final_throughput_data = 
generate_metric_report(throughput_dist_ms, 'throughput') - - _, interpolated_latency_data = interpolated_latency_data_by_rps[i] - latency_dist_ms = Distribution(data=[val * 1000 for val in interpolated_latency_data]) - final_latency_data = generate_metric_report(latency_dist_ms, 'timePerRequest') - - benchmark_json.append({ - "requestsPerSecond": rps, - **final_itl_data, - **final_ttft_data, - **final_throughput_data, - **final_latency_data, - }) - return benchmark_json - def generate_benchmarks_json(benchmarks: List[TextGenerationBenchmark]): - raw_benchmark_json = [] + benchmark_json = [] for benchmark in benchmarks: benchmarks_report = generate_benchmark_json(benchmark) - raw_benchmark_json.append(benchmarks_report) - interpolated_benchmark_json = generate_interpolated_benchmarks(benchmarks) + benchmark_json.append(benchmarks_report) - return { "raw": raw_benchmark_json, "interpolatedByRps": interpolated_benchmark_json } + return { "benchmarks": benchmark_json } def generate_js_variable(variable_name: str, data: dict) -> str: json_data = json.dumps(data, indent=2) - return f'`window.{variable_name} = {json_data};`' # Wrap in quotes + return f'window.{variable_name} = {json_data};' def generate_ui_api_data(report: TextGenerationBenchmarkReport): filtered_benchmarks = list(filter(lambda bm: (bm.completed_request_rate > 0) and bm.mode != 'throughput', report.benchmarks)) @@ -241,4 +181,8 @@ def generate_ui_api_data(report: TextGenerationBenchmarkReport): with open("ben_test/benchmarks.js", "w") as f: f.write(benchmarks_script) - print("Reports saved to run_info.json, workload_details.json, benchmarks.json") \ No newline at end of file + return { + "window.run_info = {};": run_info_script, + "window.workload_details = {};": workload_details_script, + "window.benchmarks = {};": benchmarks_script, + } \ No newline at end of file diff --git a/src/guidellm/utils/injector.py b/src/guidellm/utils/injector.py index fb5216aa..21e20901 100644 --- a/src/guidellm/utils/injector.py +++ b/src/guidellm/utils/injector.py @@ -1,20 +1,18 @@ from pathlib import Path from typing import Union -from pydantic import BaseModel - from guidellm.config import settings from guidellm.utils.text import load_text __all__ = ["create_report", "inject_data"] -def create_report(model: BaseModel, output_path: Union[str, Path]) -> Path: +def create_report(js_data: dict, output_path: Union[str, Path]) -> Path: """ - Creates a report from the model and saves it to the output path. + Creates a report from the dictionary and saves it to the output path. - :param model: the model to serialize and inject - :type model: BaseModel + :param js_data: dict with match str and json data to inject + :type js_data: dict :param output_path: the path, either a file or a directory, to save the report to. If a directory, the report will be saved as "report.html" inside of the directory. 
@@ -27,10 +25,8 @@ def create_report(model: BaseModel, output_path: Union[str, Path]) -> Path: html_content = load_text(settings.report_generation.source) report_content = inject_data( - model, + js_data, html_content, - settings.report_generation.report_html_match, - settings.report_generation.report_html_placeholder, ) if not output_path.suffix: @@ -39,32 +35,23 @@ def create_report(model: BaseModel, output_path: Union[str, Path]) -> Path: output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(report_content) - + print(f'Report saved to {output_path}') return output_path - def inject_data( - model: BaseModel, + js_data: dict, html: str, - match: str, - placeholder: str, ) -> str: """ - Injects the data from the model into the HTML while replacing the placeholder. + Injects the json data into the HTML while replacing the placeholder. - :param model: the model to serialize and inject - :type model: BaseModel + :param js_data: the json data to inject + :type js_data: dict :param html: the html to inject the data into :type html: str - :param match: the string to match in the html to find the placeholder - :type match: str - :param placeholder: the placeholder to replace with the model data - inside of the placeholder - :type placeholder: str - :return: the html with the model data injected + :return: the html with the json data injected :rtype: str """ - model_str = model.json() - inject_str = match.replace(placeholder, model_str) - - return html.replace(match, inject_str) + for placeholder, script in js_data.items(): + html = html.replace(placeholder, script) + return html \ No newline at end of file diff --git a/src/guidellm/utils/interpolation.py b/src/guidellm/utils/interpolation.py deleted file mode 100644 index ef19d8e4..00000000 --- a/src/guidellm/utils/interpolation.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import List, Tuple -import numpy as np - -def linear_interpolate(target: float, lower: Tuple[float, float], upper: Tuple[float, float]) -> float: - """ - Linearly interpolates a value at 'target' given two points. - If the target equals one of the bounds, the corresponding value is returned. - """ - lower_ref, lower_measurement = lower - upper_ref, upper_measurement = upper - - if upper_ref == lower_ref: - return lower_measurement - if target <= lower_ref: - return lower_measurement - if target >= upper_ref: - return upper_measurement - - t = (target - lower_ref) / (upper_ref - lower_ref) - return lower_measurement + t * (upper_measurement - lower_measurement) - -def stretch_list(arr: List[float], target_length: int): - if len(arr) == target_length: - return np.array(arr) - - original_x = np.linspace(0, 1, len(arr)) - target_x = np.linspace(0, 1, target_length) - stretched_arr = list(np.interp(target_x, original_x, arr)) - return stretched_arr - -def interpolate_measurements(target: float, lower_ref_measurements_pair: Tuple[float, List[float]], upper_ref_measurements_pair: Tuple[float, List[float]]) -> List[float]: - """ - Interpolates each corresponding measurement value between lower and upper benchmarks. - Assumes that lower_measurements and upper_measurements have the same length. 
- """ - lower_ref, lower_measurements = lower_ref_measurements_pair - upper_ref, upper_measurements = upper_ref_measurements_pair - - if len(lower_measurements) < len(upper_measurements): - lower_measurements = stretch_list(lower_measurements, len(upper_measurements)) - if len(lower_measurements) > len(upper_measurements): - upper_measurements = stretch_list(upper_measurements, len(lower_measurements)) - - return [ - linear_interpolate(target, (lower_ref, lower_measurements[i]), (upper_ref, upper_measurements[i])) - for i in range(len(lower_measurements)) - ] - -def interpolate_data_points(data_points: List[Tuple[float, List[float]]], - target_ref: List[float]) -> List[Tuple[float, List[float]]]: - """ - Given sorted data_points as tuples of (scalar, measurements) and a list of target scalar values, - interpolate the measurements for each target. - - The data_points must be sorted by the scalar value in ascending order. - Only target scalar values that fall within the min and max of the data_points are considered. - """ - if not data_points: - return [] - - lower_bound = data_points[0][0] - upper_bound = data_points[-1][0] - # Filter target_ref to only include values within the provided range. - valid_targets = [t for t in target_ref if lower_bound <= t <= upper_bound] - - interpolated_results = [] - # Pointer to the current lower data point index. - lower_idx = 0 - - for target in sorted(valid_targets): - # Advance the lower_idx until we find the correct interval. - while (lower_idx < len(data_points) - 1 and target > data_points[lower_idx + 1][0]): - lower_idx += 1 - - # If the target exactly matches a known scalar value, use its measurements. - if target == data_points[lower_idx][0]: - interpolated_results.append((target, data_points[lower_idx][1][:])) - # Otherwise, if target lies between two data points, interpolate. - elif lower_idx < len(data_points) - 1: - lower_ref, lower_measurements = data_points[lower_idx] - upper_ref, upper_measurements = data_points[lower_idx + 1] - interpolated = interpolate_measurements(target, (lower_ref, lower_measurements), - (upper_ref, upper_measurements)) - interpolated_results.append((target, interpolated)) - else: - # If for some reason target is above the highest known data point, ignore it. 
- continue - - return interpolated_results \ No newline at end of file diff --git a/tests/unit/utils/test_interpolation.py b/tests/unit/utils/test_interpolation.py deleted file mode 100644 index a79504b5..00000000 --- a/tests/unit/utils/test_interpolation.py +++ /dev/null @@ -1,26 +0,0 @@ -import pytest -from guidellm.utils.interpolation import ( - linear_interpolate, - interpolate_measurements, - interpolate_data_points, - stretch_list -) - -@pytest.mark.smoke() -def test_linear_interpolate(): - assert linear_interpolate(2, (1, 4), (3, 6)) == 5 - - -@pytest.mark.smoke() -def test_stretch_list(): - assert stretch_list([1,3,5], 5) == [1,2,3,4,5] - -@pytest.mark.smoke() -def test_interpolate_measurements(): - assert interpolate_measurements(2, (1, [1,2,3,4,5]), (3, [2,3,4,5,6])) == [1.5,2.5,3.5,4.5,5.5] - assert interpolate_measurements(2, (1, [1,2,3,4,5]), (3, [5,4,3,2,1])) == [3,3,3,3,3] - - -@pytest.mark.smoke() -def test_interpolate_data_point(): - assert interpolate_data_points([(1, [1,2,3,4,5]), (3, [2,3,4,5,6]), (9, [5,6,7,8,9])], [1,2,3,4,5,6,7,8,9]) == [(1, [1,2,3,4,5]), (2, [1.5,2.5,3.5,4.5,5.5]), (3, [2,3,4,5,6]), (4, [2.5, 3.5, 4.5, 5.5, 6.5]), (5, [3, 4, 5, 6, 7]), (6, [3.5, 4.5, 5.5, 6.5, 7.5]), (7, [4, 5, 6, 7, 8]), (8, [4.5, 5.5, 6.5, 7.5, 8.5]), (9, [5, 6, 7, 8, 9])] \ No newline at end of file
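
Reviewer note (not part of the patch): the new report flow replaces pydantic-model injection with plain string substitution. generate_ui_api_data now returns a mapping of placeholder strings (e.g. "window.run_info = {};") to full window.* assignment scripts, and the simplified inject_data swaps each placeholder in the UI HTML for its script. The sketch below is a minimal, standalone approximation of that contract for local experimentation; the inline HTML template and the "run_info" payload are illustrative assumptions, not values taken from the real settings.report_generation.source.

import json

def generate_js_variable(variable_name: str, data: dict) -> str:
    # Emit a plain `window.<name> = {...};` script, matching the patched helper
    # (no backtick/quote wrapping around the statement).
    return f"window.{variable_name} = {json.dumps(data, indent=2)};"

def inject_data(js_data: dict, html: str) -> str:
    # Each key is a placeholder baked into the UI build; each value is the
    # script that replaces it. Mirrors the simplified injector in this patch.
    for placeholder, script in js_data.items():
        html = html.replace(placeholder, script)
    return html

if __name__ == "__main__":
    # Hypothetical stand-in for the built UI page; the real template is loaded
    # from settings.report_generation.source by create_report.
    html_template = "<html><body><script>window.run_info = {};</script></body></html>"
    js_data = {
        "window.run_info = {};": generate_js_variable("run_info", {"model": "demo-model"}),
    }
    print(inject_data(js_data, html_template))

Keying the dict by the literal placeholder string keeps inject_data free of the report_html_match / report_html_placeholder settings that this patch removes, at the cost of coupling the Python side to the exact placeholder text baked into the UI bundle.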