Skip to content

Commit ba00047

Browse files
committed
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/dbo-full-cudagraphs
2 parents b2ed6c3 + 28f350e commit ba00047

File tree

134 files changed

+2334
-1161
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

134 files changed

+2334
-1161
lines changed

.buildkite/nightly-benchmarks/scripts/compare-json-results.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def split_json_by_tp_pp(
218218
"--xaxis",
219219
type=str,
220220
default="# of max concurrency.",
221-
help="column name to use as X Axis in comparision graph",
221+
help="column name to use as X Axis in comparison graph",
222222
)
223223
args = parser.parse_args()
224224

.buildkite/release-pipeline.yaml

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,24 @@
11
steps:
2-
# aarch64 + CUDA builds
3-
- label: "Build arm64 wheel - CUDA 12.8"
4-
id: build-wheel-arm64-cuda-12-8
2+
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
3+
- label: "Build arm64 wheel - CUDA 12.9"
4+
id: build-wheel-arm64-cuda-12-9
55
agents:
66
queue: arm64_cpu_queue_postmerge
77
commands:
88
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
99
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
10-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
10+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
1111
- "mkdir artifacts"
1212
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
1313
- "bash .buildkite/scripts/upload-wheels.sh"
1414
env:
1515
DOCKER_BUILDKIT: "1"
1616

17-
# x86 + CUDA builds
17+
- block: "Build CUDA 12.8 wheel"
18+
key: block-build-cu128-wheel
19+
1820
- label: "Build wheel - CUDA 12.8"
21+
depends_on: block-build-cu128-wheel
1922
id: build-wheel-cuda-12-8
2023
agents:
2124
queue: cpu_queue_postmerge
@@ -44,18 +47,14 @@ steps:
4447
env:
4548
DOCKER_BUILDKIT: "1"
4649

47-
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
48-
# However, this block can be uncommented to save some compute hours.
49-
# - block: "Build CUDA 11.8 wheel"
50-
# key: block-build-cu118-wheel
51-
52-
- label: "Build wheel - CUDA 11.8"
53-
# depends_on: block-build-cu118-wheel
54-
id: build-wheel-cuda-11-8
50+
# x86 + CUDA builds
51+
- label: "Build wheel - CUDA 12.9"
52+
depends_on: ~
53+
id: build-wheel-cuda-12-9
5554
agents:
5655
queue: cpu_queue_postmerge
5756
commands:
58-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
57+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
5958
- "mkdir artifacts"
6059
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
6160
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -75,14 +74,15 @@ steps:
7574
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
7675
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
7776

77+
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
7878
- label: "Build release image (arm64)"
7979
depends_on: ~
8080
id: build-release-image-arm64
8181
agents:
8282
queue: arm64_cpu_queue_postmerge
8383
commands:
8484
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
85-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
85+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
8686
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
8787

8888
# Add job to create multi-arch manifest
@@ -103,7 +103,7 @@ steps:
103103
- create-multi-arch-manifest
104104
- build-wheel-cuda-12-8
105105
- build-wheel-cuda-12-6
106-
- build-wheel-cuda-11-8
106+
- build-wheel-cuda-12-9
107107
id: annotate-release-workflow
108108
agents:
109109
queue: cpu_queue_postmerge

.buildkite/scripts/upload-wheels.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
5858
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
5959
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
6060

61-
if [[ $normal_wheel == *"cu118"* ]]; then
62-
# if $normal_wheel matches cu118, do not upload the index.html
63-
echo "Skipping index files for cu118 wheels"
64-
elif [[ $normal_wheel == *"cu126"* ]]; then
61+
if [[ $normal_wheel == *"cu126"* ]]; then
6562
# if $normal_wheel matches cu126, do not upload the index.html
6663
echo "Skipping index files for cu126 wheels"
64+
elif [[ $normal_wheel == *"cu128"* ]]; then
65+
# if $normal_wheel matches cu128, do not upload the index.html
66+
echo "Skipping index files for cu128 wheels"
6767
else
68-
# only upload index.html for cu128 wheels (default wheels)
68+
# only upload index.html for cu129 wheels (default wheels) as it
69+
# is available on both x86 and arm64
6970
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
7071
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
7172
fi
@@ -74,14 +75,15 @@ fi
7475
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
7576
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
7677

77-
if [[ $normal_wheel == *"cu118"* ]]; then
78-
# if $normal_wheel matches cu118, do not upload the index.html
79-
echo "Skipping index files for cu118 wheels"
80-
elif [[ $normal_wheel == *"cu126"* ]]; then
78+
if [[ $normal_wheel == *"cu126"* ]]; then
8179
# if $normal_wheel matches cu126, do not upload the index.html
8280
echo "Skipping index files for cu126 wheels"
81+
elif [[ $normal_wheel == *"cu128"* ]]; then
82+
# if $normal_wheel matches cu128, do not upload the index.html
83+
echo "Skipping index files for cu128 wheels"
8384
else
84-
# only upload index.html for cu128 wheels (default wheels)
85+
# only upload index.html for cu129 wheels (default wheels) as it
86+
# is available on both x86 and arm64
8587
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
8688
fi
8789

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,7 @@ steps:
666666
# Quantization
667667
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
668668
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
669-
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
669+
# - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
670670
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
671671
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
672672
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -676,7 +676,7 @@ steps:
676676
- pytest -v -s tests/compile/test_fusion_all_reduce.py
677677
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
678678
- pytest -v -s tests/kernels/moe/test_flashinfer.py
679-
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
679+
# - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
680680

681681
##### 1 GPU test #####
682682
##### multi gpus test #####

benchmarks/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,12 @@ become available.
110110

111111
🚧: to be supported
112112

113-
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
113+
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
114+
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
115+
116+
```bash
117+
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
118+
```
114119

115120
## 🚀 Example - Online Benchmark
116121

benchmarks/benchmark_block_pool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def invoke_main() -> None:
5757
"--num-iteration",
5858
type=int,
5959
default=1000,
60-
help="Number of iterations to run to stablize final data readings",
60+
help="Number of iterations to run to stabilize final data readings",
6161
)
6262
parser.add_argument(
6363
"--allocate-blocks",

benchmarks/benchmark_ngram_proposer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def invoke_main() -> None:
7777
"--num-iteration",
7878
type=int,
7979
default=100,
80-
help="Number of iterations to run to stablize final data readings",
80+
help="Number of iterations to run to stabilize final data readings",
8181
)
8282
parser.add_argument(
8383
"--num-req", type=int, default=128, help="Number of requests in the batch"

benchmarks/benchmark_serving.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,7 @@ def create_argument_parser():
11041104
"--percentile-metrics",
11051105
type=str,
11061106
default="ttft,tpot,itl",
1107-
help="Comma-separated list of selected metrics to report percentils. "
1107+
help="Comma-separated list of selected metrics to report percentiles. "
11081108
"This argument specifies the metrics to report percentiles. "
11091109
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
11101110
'Default value is "ttft,tpot,itl".',

benchmarks/benchmark_serving_structured_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -998,7 +998,7 @@ def create_argument_parser():
998998
"--percentile-metrics",
999999
type=str,
10001000
default="ttft,tpot,itl",
1001-
help="Comma-separated list of selected metrics to report percentils. "
1001+
help="Comma-separated list of selected metrics to report percentiles. "
10021002
"This argument specifies the metrics to report percentiles. "
10031003
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
10041004
'Default value is "ttft,tpot,itl".',

benchmarks/benchmark_throughput.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -719,7 +719,7 @@ def create_argument_parser():
719719
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
720720
)
721721

722-
# hf dtaset
722+
# hf dataset
723723
parser.add_argument(
724724
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
725725
)

0 commit comments

Comments
 (0)