From 72306f2c3a4a1f8fa3880f2d69ce241f71d751da Mon Sep 17 00:00:00 2001
From: Fadi Arafeh
Date: Tue, 16 Dec 2025 12:30:41 +0000
Subject: [PATCH] Re-enable Arm CPU vLLM HUD Benchmarks

- Re-enable the benchmarks disabled in #114
- Related to https://github.com/vllm-project/vllm/pull/26494 (unclear which
  of the two needs to land first)
- Use the default block_size in serving benchmarks (i.e. 128, instead of
  setting it to 16)

Signed-off-by: Fadi Arafeh
---
 .../scripts/generate_vllm_benchmark_matrix.py |   6 +-
 .../test_generate_vllm_benchmark_matrix.py    |  18 +-
 .github/workflows/vllm-benchmark.yml          |  14 +-
 .../arm64-cpu/latency-tests-arm64-cpu.json    |  18 +-
 .../arm64-cpu/serving-tests-arm64-cpu.json    |  96 +--------
 .../arm64-cpu/throughput-tests-arm64-cpu.json |  20 +-
 .../benchmarks/cuda/throughput-tests.json     | 197 +-----------------
 7 files changed, 35 insertions(+), 334 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 194b2ed0..599ad9c8 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -17,8 +17,8 @@
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
-        # "linux.24xl.gnr",  # TODO (huydhn): The runner is down (Jan 7th 2026)
-        # "linux.arm64.m7g.4xlarge",  # TODO (huydhn): This is not working yet
+        # "linux.24xl.gnr",  # TODO (huydhn): The runner is down (Jan 7th 2026)
+        "linux.arm64.m8g.4xlarge",
         "linux.dgx.b200",
         "linux.hpu.gaudi3.8",
     ],
@@ -59,7 +59,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
-    # "linux.arm64.m7g.4xlarge": "arm64-cpu",  # TODO (huydhn): This is not working yet
+    "linux.arm64.m8g.4xlarge": "arm64-cpu",
     "linux.hpu.gaudi3.8": "hpu",
 }

diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py
index 6e5d6499..92360389 100644
--- a/.github/scripts/test_generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py
@@ -22,7 +22,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -209,7 +209,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -247,7 +247,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -286,7 +286,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -321,7 +321,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -409,7 +409,7 @@ def test_generate_benchmark_matrix():
 
     # Select multiple runners
     models = []
-    runners = ["h100", "spr", "m7g"]
+    runners = ["h100", "spr", "m8g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -419,7 +419,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -624,7 +624,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr", "m7g"]
+    runners = ["rocm", "spr", "m8g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -634,7 +634,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
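[Reviewer note — illustrative sketch, not part of the patch]
A minimal sketch of how a short runner name such as "m8g" is expected to
resolve against the mapping edited above. The dict entries mirror this diff;
the dict name, the substring-matching rule, and the "cuda" fallback are
assumptions for illustration, not code from generate_vllm_benchmark_matrix.py:

    # Illustrative only: entries copied from the diff above; the matching
    # rule and default platform are assumptions, not the real script.
    RUNNERS_TO_PLATFORMS = {
        "linux.24xl.spr-metal": "cpu",
        "linux.24xl.gnr": "cpu",
        "linux.arm64.m8g.4xlarge": "arm64-cpu",  # re-enabled by this patch
        "linux.hpu.gaudi3.8": "hpu",
    }

    def resolve_runner(short_name: str) -> tuple[str, str]:
        """Match a workflow input like "m8g" against the known runners."""
        for runner, platform in RUNNERS_TO_PLATFORMS.items():
            if short_name in runner:
                return runner, platform
        raise ValueError(f"unknown runner: {short_name}")

    assert resolve_runner("m8g") == ("linux.arm64.m8g.4xlarge", "arm64-cpu")
    # The arm64-cpu platform then selects the configs under
    # vllm-benchmarks/benchmarks/arm64-cpu/ edited later in this patch.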
"meta-llama/meta-llama-3.1-8b-instruct" }, { @@ -624,7 +624,7 @@ def test_generate_benchmark_matrix(): "meta-llama/meta-llama-3.1-8b-instruct", "mistralai/mixtral-8x7b-instruct-v0.1", ] - runners = ["rocm", "spr", "m7g"] + runners = ["rocm", "spr", "m8g"] output = json.dumps( generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 ) @@ -634,7 +634,7 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.arm64.m7g.4xlarge", + "runner": "linux.arm64.m8g.4xlarge", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 8cf8a13d..ef9cdc1c 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -25,10 +25,7 @@ on: A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything) required: true type: string - # TODO (huydhn): Remove aarch64 CPU benchmark running on m7g until the change - # from https://github.com/vllm-project/vllm/pull/26494#issuecomment-3537415441 - # is resolved and merged - default: h100,rocm,spr,gnr,b200,gaudi3 + default: h100,rocm,spr,gnr,b200,gaudi3,m8g concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -303,13 +300,11 @@ jobs: run: | set -eux - ON_ARM64_CPU=0 ON_CPU=0 - case "$DEVICE_NAME" in - cpu) ON_CPU=1 ;; - arm64-cpu) ON_ARM64_CPU=1 ;; - esac + if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then + ON_CPU=1 + fi container_name=$(docker run \ ${GPU_FLAG:-} \ @@ -322,7 +317,6 @@ jobs: -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ -e ON_CPU="${ON_CPU}" \ - -e ON_ARM64_CPU="${ON_ARM64_CPU}" \ --ipc=host \ --tty \ --detach \ diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json index 4c592863..2ac07aa9 100644 --- a/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json +++ b/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json @@ -3,7 +3,7 @@ "test_name": "latency_llama8B_tp1", "environment_variables": { "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 + "VLLM_CPU_KVCACHE_SPACE": 20 }, "parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", @@ -12,19 +12,5 @@ "num_iters_warmup": 5, "num_iters": 15 } - }, - { - "test_name": "latency_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } } -] +] \ No newline at end of file diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json index 5474179f..0f07860e 100644 --- a/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json +++ b/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json @@ -1,107 +1,19 @@ [ { - "test_name": "serving_llama8B_tp1_sharegpt", + "test_name": "serving_llama8B_tp1_random_1024_128", "qps_list": [1, 4, 16, "inf"], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, + "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": 3000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - 
"VLLM_CPU_KVCACHE_SPACE": 40 + "VLLM_CPU_KVCACHE_SPACE": 20 }, "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "device": "cpu", "dtype": "bfloat16", "distributed_executor_backend": "mp", - "block_size": 16, - "trust_remote_code": "", - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "device": "cpu", - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 16, - "trust_remote_code": "", - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "device": "cpu", - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 16, - "trust_remote_code": "", - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_random_1024_128", - "qps_list": [1, 4, 16, "inf"], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "device": "cpu", - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 16, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", @@ -115,7 +27,7 @@ "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 64 } } ] diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json index 257786ca..41213479 100644 --- a/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json +++ b/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json @@ -8,24 +8,12 @@ "parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, + "input-len": 1024, + "output-len": 128, + "max-model-len": 1152, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" 
- } - }, - { - "test_name": "throughput_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, + "num_prompts": 64, "backend": "vllm" } } diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index d6c4a91a..e4453088 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -1,199 +1,20 @@ [ { "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 20 + }, "parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, + "input-len": 1024, + "output-len": 128, + "max-model-len": 1152, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama70B_tp4", - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_mixtral8x7B_tp2", - "parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tensor_parallel_size": 2, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama4_scout_tp4", - "parameters": { - "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_llama4_maverick_fp8_tp8", - "parameters": { - "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gpt_oss_20b_tp1", - "parameters": { - "model": "openai/gpt-oss-20b", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gpt_oss_120b_tp4", - "parameters": { - "model": "openai/gpt-oss-120b", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_opt125m_tp1", - "parameters": { - "model": "facebook/opt-125m", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 2048 - } - }, - { - "test_name": "throughput_deepseek_v3_1_tp8", - "parameters": { - "model": "deepseek-ai/DeepSeek-V3.1", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": 
"throughput_deepseek_v3_2_tp8", - "parameters": { - "model": "deepseek-ai/DeepSeek-V3.2", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_deepseek_r1_tp8", - "parameters": { - "model": "deepseek-ai/DeepSeek-R1", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gemma_3_27b_it_tp8", - "parameters": { - "model": "google/gemma-3-27b-it", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_qwen3_30b_a3b_tp8", - "parameters": { - "model": "Qwen/Qwen3-30B-A3B", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gemma3_12b_it_fp8_torchao", - "parameters": { - "model": "pytorch/gemma-3-12b-it-FP8", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_gemma3_12b_it_int4_torchao", - "parameters": { - "model": "pytorch/gemma-3-12b-it-INT4", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_gemma3_27b_it_fp8_torchao", - "parameters": { - "model": "pytorch/gemma-3-27b-it-FP8", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_gemma3_27b_it_int4_torchao", - "parameters": { - "model": "pytorch/gemma-3-27b-it-INT4", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, + "num_prompts": 64, "backend": "vllm" } } -] +] \ No newline at end of file