Description
I created this script, which ran for a while:
```bash
#!/usr/bin/env bash
export CUDA_VISIBLE_DEVICES=4,5,6,7
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
llm-optimizer \
  --framework vllm \
  --model /opt/models/Llama-3.3-70B-Instruct \
  --server-args "tensor_parallel_size=4;max_model_len=96000;gpu_memory_utilization=0.92;dtype=auto;kv_cache_dtype=fp8_e5m2;disable_custom_all_reduce=True;max_num_batched_tokens=[16384,32768];max_num_seqs=[8,16]" \
  --client-args "dataset_name=random;random_input_len=[80000,87000];random_output_len=[4096,8192];max_concurrency=[8,16,24];num_prompts=600" \
  --constraints 'e2e_latency:p95<3000ms;ttft:median<600ms;itl:p95<20ms' \
  --output-json vllm_llama33_70b_tp4_ctx96k_long.json
```
It generated this JSONL file:
{"config": {"client_args": {"dataset_name": "random", "random_input_len": 80000, "random_output_len": 4096, "max_concurrency": 8, "num_prompts": 600}, "server_args": {"tensor_parallel_size": 4, "max_model_len": 96000, "gpu_memory_utilization": 0.92, "dtype": "auto", "kv_cache_dtype": "fp8_e5m2", "disable_custom_all_reduce": true, "max_num_batched_tokens": 16384, "max_num_seqs": 8}, "server_cmd_args": ["--tensor-parallel-size=4", "--max-model-len=96000", "--gpu-memory-utilization=0.92", "--dtype=auto", "--kv-cache-dtype=fp8_e5m2", "--disable-custom-all-reduce", "--max-num-batched-tokens=16384", "--max-num-seqs=8"]}, "results": {"backend": "vllm", "dataset_name": "random", "request_rate": null, "max_concurrency": 8, "sharegpt_output_len": null, "random_input_len": 80000, "random_output_len": 4096, "random_range_ratio": null, "duration": 6751.398178250063, "completed": 599, "total_input_tokens": 23531562, "total_output_tokens": 1260603, "total_output_tokens_retokenized": 1260598, "request_throughput": 0.08872236301062879, "input_throughput": 3485.43536890003, "output_throughput": 186.71732383687427, "mean_e2e_latency_ms": 89713.75561636081, "median_e2e_latency_ms": 87252.22932687029, "std_e2e_latency_ms": 48975.602246598995, "p95_e2e_latency_ms": 164983.1656646915, "p99_e2e_latency_ms": 186898.72136908583, "mean_ttft_ms": 3987.636057498285, "median_ttft_ms": 3711.264762096107, "std_ttft_ms": 2892.3316966101356, "p95_ttft_ms": 8502.071018982679, "p99_ttft_ms": 9888.493707254527, "mean_tpot_ms": 41.1698091101173, "median_tpot_ms": 40.32841876156102, "std_tpot_ms": 13.771030859286075, "p95_tpot_ms": 51.28891339615604, "p99_tpot_ms": 61.49567778480838, "mean_itl_ms": 40.75379246365447, "median_itl_ms": 28.691578889265656, "std_itl_ms": 132.0257121061992, "p95_itl_ms": 30.788091011345383, "p99_itl_ms": 37.04008973203597, "concurrency": 7.9596163928416015, "accept_length": null}, "cmd": "vllm serve /opt/models/Llama-3.3-70B-Instruct --host 127.0.0.1 --port 8000 --tensor-parallel-size=4 --max-model-len=96000 --gpu-memory-utilization=0.92 --dtype=auto --kv-cache-dtype=fp8_e5m2 --disable-custom-all-reduce --max-num-batched-tokens=16384 --max-num-seqs=8", "constraints": [{"op": "<", "value": 3000.0, "name": "p95_e2e_latency_ms"}, {"op": "<", "value": 600.0, "name": "median_ttft_ms"}, {"op": "<", "value": 20.0, "name": "p95_itl_ms"}], "metadata": {"gpu_type": "H100", "gpu_count": 8, "model_tag": "/opt/models/Llama-3.3-70B-Instruct", "input_tokens": 80000, "output_tokens": 4096}} {"config": {"client_args": {"dataset_name": "random", "random_input_len": 80000, "random_output_len": 4096, "max_concurrency": 16, "num_prompts": 600}, "server_args": {"tensor_parallel_size": 4, "max_model_len": 96000, "gpu_memory_utilization": 0.92, "dtype": "auto", "kv_cache_dtype": "fp8_e5m2", "disable_custom_all_reduce": true, "max_num_batched_tokens": 16384, "max_num_seqs": 8}, "server_cmd_args": ["--tensor-parallel-size=4", "--max-model-len=96000", "--gpu-memory-utilization=0.92", "--dtype=auto", "--kv-cache-dtype=fp8_e5m2", "--disable-custom-all-reduce", "--max-num-batched-tokens=16384", "--max-num-seqs=8"]}, "results": {"backend": "vllm", "dataset_name": "random", "request_rate": null, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 80000, "random_output_len": 4096, "random_range_ratio": null, "duration": 6767.213897739071, "completed": 599, "total_input_tokens": 23531562, "total_output_tokens": 1260603, "total_output_tokens_retokenized": 1260599, "request_throughput": 0.08851500913841753, 
"input_throughput": 3477.2895249937205, "output_throughput": 186.28094501655517, "mean_e2e_latency_ms": 179108.1923766245, "median_e2e_latency_ms": 177338.34646316245, "std_e2e_latency_ms": 54052.65887605943, "p95_e2e_latency_ms": 264816.14120290615, "p99_e2e_latency_ms": 290987.7845663856, "mean_ttft_ms": 93084.54172591522, "median_ttft_ms": 93788.73322624713, "std_ttft_ms": 20162.091320653693, "p95_ttft_ms": 126790.95927150921, "p99_ttft_ms": 144547.8808142617, "mean_tpot_ms": 40.82088372219606, "median_tpot_ms": 40.26403315415724, "std_tpot_ms": 10.271755999733859, "p95_tpot_ms": 51.38441960606455, "p99_tpot_ms": 61.28654271746014, "mean_itl_ms": 40.89523737385885, "median_itl_ms": 28.631073655560613, "std_itl_ms": 133.94720522112567, "p95_itl_ms": 30.64447245560586, "p99_itl_ms": 39.67948021832853, "concurrency": 15.853763284982364, "accept_length": null}, "cmd": "vllm serve /opt/models/Llama-3.3-70B-Instruct --host 127.0.0.1 --port 8000 --tensor-parallel-size=4 --max-model-len=96000 --gpu-memory-utilization=0.92 --dtype=auto --kv-cache-dtype=fp8_e5m2 --disable-custom-all-reduce --max-num-batched-tokens=16384 --max-num-seqs=8", "constraints": [{"op": "<", "value": 3000.0, "name": "p95_e2e_latency_ms"}, {"op": "<", "value": 600.0, "name": "median_ttft_ms"}, {"op": "<", "value": 20.0, "name": "p95_itl_ms"}], "metadata": {"gpu_type": "H100", "gpu_count": 8, "model_tag": "/opt/models/Llama-3.3-70B-Instruct", "input_tokens": 80000, "output_tokens": 4096}} {"config": {"client_args": {"dataset_name": "random", "random_input_len": 80000, "random_output_len": 4096, "max_concurrency": 24, "num_prompts": 600}, "server_args": {"tensor_parallel_size": 4, "max_model_len": 96000, "gpu_memory_utilization": 0.92, "dtype": "auto", "kv_cache_dtype": "fp8_e5m2", "disable_custom_all_reduce": true, "max_num_batched_tokens": 16384, "max_num_seqs": 8}, "server_cmd_args": ["--tensor-parallel-size=4", "--max-model-len=96000", "--gpu-memory-utilization=0.92", "--dtype=auto", "--kv-cache-dtype=fp8_e5m2", "--disable-custom-all-reduce", "--max-num-batched-tokens=16384", "--max-num-seqs=8"]}, "results": {"backend": "vllm", "dataset_name": "random", "request_rate": null, "max_concurrency": 24, "sharegpt_output_len": null, "random_input_len": 80000, "random_output_len": 4096, "random_range_ratio": null, "duration": 6747.998054834083, "completed": 599, "total_input_tokens": 23531562, "total_output_tokens": 1260603, "total_output_tokens_retokenized": 1260599, "request_throughput": 0.08876706767437383, "input_throughput": 3487.191580196534, "output_throughput": 186.8114053614669, "mean_e2e_latency_ms": 266719.65197644714, "median_e2e_latency_ms": 266058.1666762009, "std_e2e_latency_ms": 58493.210966502484, "p95_e2e_latency_ms": 358413.79147511907, "p99_e2e_latency_ms": 390500.66562048154, "mean_ttft_ms": 180930.87402612783, "median_ttft_ms": 180957.03261205927, "std_ttft_ms": 31363.22805180024, "p95_ttft_ms": 228109.66045362875, "p99_ttft_ms": 246372.68500779755, "mean_tpot_ms": 40.37376735685883, "median_tpot_ms": 40.431189570966126, "std_tpot_ms": 6.313308869190464, "p95_tpot_ms": 50.62740799662683, "p99_tpot_ms": 59.17843842601464, "mean_itl_ms": 40.78357977757744, "median_itl_ms": 28.538807295262814, "std_itl_ms": 133.41297560926824, "p95_itl_ms": 30.58791405055672, "p99_itl_ms": 40.86611323058583, "concurrency": 23.67592139707872, "accept_length": null}, "cmd": "vllm serve /opt/models/Llama-3.3-70B-Instruct --host 127.0.0.1 --port 8000 --tensor-parallel-size=4 --max-model-len=96000 --gpu-memory-utilization=0.92 
--dtype=auto --kv-cache-dtype=fp8_e5m2 --disable-custom-all-reduce --max-num-batched-tokens=16384 --max-num-seqs=8", "constraints": [{"op": "<", "value": 3000.0, "name": "p95_e2e_latency_ms"}, {"op": "<", "value": 600.0, "name": "median_ttft_ms"}, {"op": "<", "value": 20.0, "name": "p95_itl_ms"}], "metadata": {"gpu_type": "H100", "gpu_count": 8, "model_tag": "/opt/models/Llama-3.3-70B-Instruct", "input_tokens": 80000, "output_tokens": 4096}} {"config": {"client_args": {"dataset_name": "random", "random_input_len": 80000, "random_output_len": 8192, "max_concurrency": 8, "num_prompts": 600}, "server_args": {"tensor_parallel_size": 4, "max_model_len": 96000, "gpu_memory_utilization": 0.92, "dtype": "auto", "kv_cache_dtype": "fp8_e5m2", "disable_custom_all_reduce": true, "max_num_batched_tokens": 16384, "max_num_seqs": 8}, "server_cmd_args": ["--tensor-parallel-size=4", "--max-model-len=96000", "--gpu-memory-utilization=0.92", "--dtype=auto", "--kv-cache-dtype=fp8_e5m2", "--disable-custom-all-reduce", "--max-num-batched-tokens=16384", "--max-num-seqs=8"]}, "results": {"backend": "vllm", "dataset_name": "random", "request_rate": null, "max_concurrency": 8, "sharegpt_output_len": null, "random_input_len": 80000, "random_output_len": 8192, "random_range_ratio": null, "duration": 11338.705406649038, "completed": 599, "total_input_tokens": 23603683, "total_output_tokens": 2506107, "total_output_tokens_retokenized": 2506101, "request_throughput": 0.052827900409930864, "input_throughput": 2081.691176680431, "output_throughput": 221.02232222475894, "mean_e2e_latency_ms": 150722.86968331208, "median_e2e_latency_ms": 152394.92019778118, "std_e2e_latency_ms": 83167.74232719379, "p95_e2e_latency_ms": 279674.48666850105, "p99_e2e_latency_ms": 296699.7111915704, "mean_ttft_ms": 3931.3721871026833, "median_ttft_ms": 3670.4460899345577, "std_ttft_ms": 2758.2607502959845, "p95_ttft_ms": 8427.037762198597, "p99_ttft_ms": 8908.119584014637, "mean_tpot_ms": 35.43254779260766, "median_tpot_ms": 34.71051913640667, "std_tpot_ms": 10.000552394740215, "p95_tpot_ms": 40.988603967133514, "p99_tpot_ms": 49.53327963338663, "mean_itl_ms": 35.09392204002253, "median_itl_ms": 28.954532463103533, "std_itl_ms": 93.87354161646162, "p95_itl_ms": 31.03700012434274, "p99_itl_ms": 32.132303197868175, "concurrency": 7.962372749128996, "accept_length": null}, "cmd": "vllm serve /opt/models/Llama-3.3-70B-Instruct --host 127.0.0.1 --port 8000 --tensor-parallel-size=4 --max-model-len=96000 --gpu-memory-utilization=0.92 --dtype=auto --kv-cache-dtype=fp8_e5m2 --disable-custom-all-reduce --max-num-batched-tokens=16384 --max-num-seqs=8", "constraints": [{"op": "<", "value": 3000.0, "name": "p95_e2e_latency_ms"}, {"op": "<", "value": 600.0, "name": "median_ttft_ms"}, {"op": "<", "value": 20.0, "name": "p95_itl_ms"}], "metadata": {"gpu_type": "H100", "gpu_count": 8, "model_tag": "/opt/models/Llama-3.3-70B-Instruct", "input_tokens": 80000, "output_tokens": 8192}}
When I then run this command:

```bash
llm-optimizer visualize --data-file vllm_llama33_70b_tp4_ctx96k_long.jsonl --port 8080
```

I get this error:

```
[ERROR] Failed to generate visualization: Extra data: line 2 column 1 (char 2510)
Traceback (most recent call last):
  File "/home/user/git/LLAMA/.venv/lib/python3.10/site-packages/llm_optimizer/cli.py", line 660, in visualize
    html_file = optimizer.generate_dashboard(data_file, output_file=output)
  File "/home/user/git/LLAMA/.venv/lib/python3.10/site-packages/llm_optimizer/visualization/visualize.py", line 256, in generate_dashboard
    data_dict = self.load_benchmark_data(data_file)
  File "/home/user/git/LLAMA/.venv/lib/python3.10/site-packages/llm_optimizer/visualization/visualize.py", line 102, in load_benchmark_data
    file_data = json.load(f)
  File "/usr/lib/python3.10/json/__init__.py", line 293, in load
    return loads(fp.read(),
  File "/usr/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.10/json/decoder.py", line 340, in decode
    raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 2510)
```
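For what it's worth, reading the file line by line with `json.loads` should work, since the "Extra data" error means the first object parsed cleanly and the decoder only choked when the second record began. A minimal sketch of what I mean (my own test snippet, not the tool's code):

```python
import json

# Read the benchmark output as JSONL: one JSON object per line.
# json.load() expects a single document, so it raises "Extra data"
# the moment the second record begins.
records = []
with open("vllm_llama33_70b_tp4_ctx96k_long.jsonl") as f:
    for line in f:
        if line.strip():
            records.append(json.loads(line))

print(f"parsed {len(records)} benchmark records")
```

So the file itself looks like valid JSONL; `load_benchmark_data` appears to call `json.load` on the whole file instead of iterating over lines.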
So as a test I deleted all JSONL lines except the first one, and then I get this error instead:
```
[ERROR] Failed to generate visualization: Invalid data format. Expected enhanced format with 'metadata', 'best_configurations', and 'test_results' fields.
Traceback (most recent call last):
  File "/home/user/git/LLAMA/.venv/lib/python3.10/site-packages/llm_optimizer/cli.py", line 660, in visualize
    html_file = optimizer.generate_dashboard(data_file, output_file=output)
  File "/home/user/git/LLAMA/.venv/lib/python3.10/site-packages/llm_optimizer/visualization/visualize.py", line 256, in generate_dashboard
    data_dict = self.load_benchmark_data(data_file)
  File "/home/user/git/LLAMA/.venv/lib/python3.10/site-packages/llm_optimizer/visualization/visualize.py", line 106, in load_benchmark_data
    raise ValueError(
ValueError: Invalid data format. Expected enhanced format with 'metadata', 'best_configurations', and 'test_results' fields.
```
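Guessing at the expected structure from the error message alone, it looks like `visualize` wants a single JSON object with `metadata`, `best_configurations`, and `test_results` at the top level, rather than raw JSONL records. Something like the following might bridge the gap (the envelope keys come from the error text, but everything about their contents is my assumption):

```python
import json

# Purely a guess at the "enhanced" schema, inferred from the error
# message only; the real nested shapes may differ.
with open("vllm_llama33_70b_tp4_ctx96k_long.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

enhanced = {
    "metadata": records[0].get("metadata", {}),  # assumed: run-level metadata
    "best_configurations": [],                   # assumed: unknown selection logic
    "test_results": records,                     # assumed: one entry per run
}

with open("vllm_llama33_70b_tp4_ctx96k_long_enhanced.json", "w") as out:
    json.dump(enhanced, out)
```

Even if a conversion like that gets the dashboard to load, I'd expect `llm-optimizer` to emit whatever format its own `visualize` command consumes, so this looks like a mismatch between the benchmark output and the visualizer.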