From 72306f2c3a4a1f8fa3880f2d69ce241f71d751da Mon Sep 17 00:00:00 2001
From: Fadi Arafeh
Date: Tue, 16 Dec 2025 12:30:41 +0000
Subject: [PATCH] Re-enable Arm CPU vLLM HUD Benchmarks

- Re-enable the benchmarks disabled in #114
- Related to https://github.com/vllm-project/vllm/pull/26494 (unclear which
  of the two needs to land first)
- Use the default block_size in serving benchmarks (i.e. 128, instead of
  setting it to 16)

Signed-off-by: Fadi Arafeh
---
 .../scripts/generate_vllm_benchmark_matrix.py |   6 +-
 .../test_generate_vllm_benchmark_matrix.py    |  18 +-
 .github/workflows/vllm-benchmark.yml          |  14 +-
 .../arm64-cpu/latency-tests-arm64-cpu.json    |  18 +-
 .../arm64-cpu/serving-tests-arm64-cpu.json    |  96 +--------
 .../arm64-cpu/throughput-tests-arm64-cpu.json |  20 +-
 .../benchmarks/cuda/throughput-tests.json     | 197 +-----------------
 7 files changed, 35 insertions(+), 334 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 194b2ed0..599ad9c8 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -17,8 +17,8 @@
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
-        # "linux.24xl.gnr",  # TODO (huydhn): The runner is down (Jan 7th 2026)
-        # "linux.arm64.m7g.4xlarge",  # TODO (huydhn): This is not working yet
+        # "linux.24xl.gnr",  # TODO (huydhn): The runner is down (Jan 7th 2026)
+        "linux.arm64.m8g.4xlarge",
         "linux.dgx.b200",
         "linux.hpu.gaudi3.8",
     ],
@@ -59,7 +59,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
-    # "linux.arm64.m7g.4xlarge": "arm64-cpu",  # TODO (huydhn): This is not working yet
+    "linux.arm64.m8g.4xlarge": "arm64-cpu",
     "linux.hpu.gaudi3.8": "hpu",
 }

diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py
index 6e5d6499..92360389 100644
--- a/.github/scripts/test_generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py
@@ -22,7 +22,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -209,7 +209,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -247,7 +247,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -286,7 +286,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -321,7 +321,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -409,7 +409,7 @@ def test_generate_benchmark_matrix():
 
     # Select multiple runners
     models = []
-    runners = ["h100", "spr", "m7g"]
+    runners = ["h100", "spr", "m8g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -419,7 +419,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -624,7 +624,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr", "m7g"]
+    runners = ["rocm", "spr", "m8g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -634,7 +634,7 @@
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
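[Reviewer note — illustrative sketch, not part of the patch]
A minimal sketch of how a short runner name such as "m8g" is expected to
resolve against the mapping edited above. The dict entries mirror this diff;
the dict name, the substring-matching rule, and the "cuda" fallback are
assumptions for illustration, not code from generate_vllm_benchmark_matrix.py:

    # Illustrative only: entries copied from the diff above; the matching
    # rule and default platform are assumptions, not the real script.
    RUNNERS_TO_PLATFORMS = {
        "linux.24xl.spr-metal": "cpu",
        "linux.24xl.gnr": "cpu",
        "linux.arm64.m8g.4xlarge": "arm64-cpu",  # re-enabled by this patch
        "linux.hpu.gaudi3.8": "hpu",
    }

    def resolve_runner(short_name: str) -> tuple[str, str]:
        """Match a workflow input like "m8g" against the known runners."""
        for runner, platform in RUNNERS_TO_PLATFORMS.items():
            if short_name in runner:
                return runner, platform
        raise ValueError(f"unknown runner: {short_name}")

    assert resolve_runner("m8g") == ("linux.arm64.m8g.4xlarge", "arm64-cpu")
    # The arm64-cpu platform then selects the configs under
    # vllm-benchmarks/benchmarks/arm64-cpu/ edited later in this patch.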
"meta-llama/meta-llama-3.1-8b-instruct" }, { @@ -624,7 +624,7 @@ def test_generate_benchmark_matrix(): "meta-llama/meta-llama-3.1-8b-instruct", "mistralai/mixtral-8x7b-instruct-v0.1", ] - runners = ["rocm", "spr", "m7g"] + runners = ["rocm", "spr", "m8g"] output = json.dumps( generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 ) @@ -634,7 +634,7 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.arm64.m7g.4xlarge", + "runner": "linux.arm64.m8g.4xlarge", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 8cf8a13d..ef9cdc1c 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -25,10 +25,7 @@ on: A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything) required: true type: string - # TODO (huydhn): Remove aarch64 CPU benchmark running on m7g until the change - # from https://github.com/vllm-project/vllm/pull/26494#issuecomment-3537415441 - # is resolved and merged - default: h100,rocm,spr,gnr,b200,gaudi3 + default: h100,rocm,spr,gnr,b200,gaudi3,m8g concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -303,13 +300,11 @@ jobs: run: | set -eux - ON_ARM64_CPU=0 ON_CPU=0 - case "$DEVICE_NAME" in - cpu) ON_CPU=1 ;; - arm64-cpu) ON_ARM64_CPU=1 ;; - esac + if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then + ON_CPU=1 + fi container_name=$(docker run \ ${GPU_FLAG:-} \ @@ -322,7 +317,6 @@ jobs: -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ -e ON_CPU="${ON_CPU}" \ - -e ON_ARM64_CPU="${ON_ARM64_CPU}" \ --ipc=host \ --tty \ --detach \ diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json index 4c592863..2ac07aa9 100644 --- a/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json +++ b/vllm-benchmarks/benchmarks/arm64-cpu/latency-tests-arm64-cpu.json @@ -3,7 +3,7 @@ "test_name": "latency_llama8B_tp1", "environment_variables": { "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 + "VLLM_CPU_KVCACHE_SPACE": 20 }, "parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", @@ -12,19 +12,5 @@ "num_iters_warmup": 5, "num_iters": 15 } - }, - { - "test_name": "latency_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } } -] +] \ No newline at end of file diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json index 5474179f..0f07860e 100644 --- a/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json +++ b/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json @@ -1,107 +1,19 @@ [ { - "test_name": "serving_llama8B_tp1_sharegpt", + "test_name": "serving_llama8B_tp1_random_1024_128", "qps_list": [1, 4, 16, "inf"], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, + "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": 3000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - 
"VLLM_CPU_KVCACHE_SPACE": 40 + "VLLM_CPU_KVCACHE_SPACE": 20 }, "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "device": "cpu", "dtype": "bfloat16", "distributed_executor_backend": "mp", - "block_size": 16, - "trust_remote_code": "", - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "device": "cpu", - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 16, - "trust_remote_code": "", - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "device": "cpu", - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 16, - "trust_remote_code": "", - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_random_1024_128", - "qps_list": [1, 4, 16, "inf"], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "device": "cpu", - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 16, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", @@ -115,7 +27,7 @@ "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 64 } } ] diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json index 257786ca..41213479 100644 --- a/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json +++ b/vllm-benchmarks/benchmarks/arm64-cpu/throughput-tests-arm64-cpu.json @@ -8,24 +8,12 @@ "parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, + "input-len": 1024, + "output-len": 128, + "max-model-len": 1152, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" 
- } - }, - { - "test_name": "throughput_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, + "num_prompts": 64, "backend": "vllm" } } diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index d6c4a91a..e4453088 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -1,199 +1,20 @@ [ { "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 20 + }, "parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, + "input-len": 1024, + "output-len": 128, + "max-model-len": 1152, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama70B_tp4", - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_mixtral8x7B_tp2", - "parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tensor_parallel_size": 2, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama4_scout_tp4", - "parameters": { - "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_llama4_maverick_fp8_tp8", - "parameters": { - "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gpt_oss_20b_tp1", - "parameters": { - "model": "openai/gpt-oss-20b", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gpt_oss_120b_tp4", - "parameters": { - "model": "openai/gpt-oss-120b", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_opt125m_tp1", - "parameters": { - "model": "facebook/opt-125m", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 2048 - } - }, - { - "test_name": "throughput_deepseek_v3_1_tp8", - "parameters": { - "model": "deepseek-ai/DeepSeek-V3.1", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": 
"throughput_deepseek_v3_2_tp8", - "parameters": { - "model": "deepseek-ai/DeepSeek-V3.2", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_deepseek_r1_tp8", - "parameters": { - "model": "deepseek-ai/DeepSeek-R1", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gemma_3_27b_it_tp8", - "parameters": { - "model": "google/gemma-3-27b-it", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_qwen3_30b_a3b_tp8", - "parameters": { - "model": "Qwen/Qwen3-30B-A3B", - "tensor_parallel_size": 8, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } - }, - { - "test_name": "throughput_gemma3_12b_it_fp8_torchao", - "parameters": { - "model": "pytorch/gemma-3-12b-it-FP8", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_gemma3_12b_it_int4_torchao", - "parameters": { - "model": "pytorch/gemma-3-12b-it-INT4", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_gemma3_27b_it_fp8_torchao", - "parameters": { - "model": "pytorch/gemma-3-27b-it-FP8", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_gemma3_27b_it_int4_torchao", - "parameters": { - "model": "pytorch/gemma-3-27b-it-INT4", - "quantization": "torchao", - "load_format": "auto", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, + "num_prompts": 64, "backend": "vllm" } } -] +] \ No newline at end of file