
Commit c4258f4

Merge pull request ROCm#581 from ROCm/upstream_merge_2025_06_23 ("Upstream merge 2025 06 23")

2 parents c786fce + 324b18a

File tree

278 files changed: +13248 −4951 lines changed

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 2 additions & 1 deletion

@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     --name "${container_name}" \
     ${image_name} \
     /bin/bash -c "
+        set -e; # Exit on first error
         python3 /workspace/vllm/examples/offline_inference/neuron.py;
         python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
         for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-            echo 'Running test file: '$f;
+            echo \"Running test file: \$f\";
             python3 -m pytest \$f -v --capture=tee-sys;
         done
     "

.buildkite/test-pipeline.yaml

Lines changed: 10 additions & 1 deletion

@@ -89,7 +89,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill

@@ -271,6 +271,15 @@ steps:
   commands:
   - pytest -v -s prefix_caching
 
+
+- label: Platform Tests (CUDA)
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
 - label: Samplers Test # 36min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:

.github/mergify.yml

Lines changed: 28 additions & 0 deletions

@@ -45,6 +45,7 @@ pull_request_rules:
       - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
       - files~=^vllm/model_executor/models/.*llama.*\.py
       - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+      - title~=(?i)llama
   actions:
     label:
       add:

@@ -65,6 +66,33 @@ pull_request_rules:
       add:
         - multi-modality
 
+- name: label-performance
+  description: Automatically apply performance label
+  conditions:
+    - or:
+      - files~=^benchmarks/
+      - files~=^vllm/benchmarks/
+      - files~=^tests/benchmarks/
+      - files~=^\.buildkite/nightly-benchmarks/
+  actions:
+    label:
+      add:
+        - performance
+
+- name: label-qwen
+  description: Automatically apply qwen label
+  conditions:
+    - or:
+      - files~=^examples/.*qwen.*\.py
+      - files~=^tests/.*qwen.*\.py
+      - files~=^vllm/model_executor/models/.*qwen.*\.py
+      - files~=^vllm/reasoning/.*qwen.*\.py
+      - title~=(?i)Qwen
+  actions:
+    label:
+      add:
+        - qwen
+
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
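The two new title~= conditions extend the path-based rules so that a pull request whose title mentions the model family also gets the corresponding label. Mergify's ~= operator is a regular-expression match, and the inline (?i) flag makes it case-insensitive. A small illustration of that matching behaviour, assuming Mergify follows standard regex semantics (the sample titles below are invented, not taken from this commit):

import re

# Hypothetical PR titles, used only to show case-insensitive matching.
titles = ["Support Qwen3 reranker", "fix qwen tokenizer", "[Bugfix] Llama rope scaling"]
for title in titles:
    labels = []
    if re.search(r"(?i)qwen", title):
        labels.append("qwen")
    if re.search(r"(?i)llama", title):
        labels.append("llama")
    print(title, "->", labels)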

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions

@@ -115,6 +115,11 @@ repos:
       entry: python tools/check_spdx_header.py
       language: python
       types: [python]
+    - id: check-root-lazy-imports
+      name: Check root lazy imports
+      entry: python tools/check_init_lazy_imports.py
+      language: python
+      types: [python]
     - id: check-filenames
       name: Check for spaces in all filenames
       entry: bash
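The contents of tools/check_init_lazy_imports.py are not part of this diff, so the following is only a hypothetical sketch of the kind of check such a hook might run: scan a package's root __init__.py for module-level imports that should instead be deferred. The allowlist and default path here are assumptions made purely for illustration.

import ast
import sys

# Hypothetical allowlist of modules considered cheap enough to import eagerly.
ALLOWED_ROOTS = {"typing", "importlib", "os"}

def module_level_imports(path: str) -> list[str]:
    """Return top-level imported module names found in the given file."""
    tree = ast.parse(open(path, encoding="utf-8").read(), filename=path)
    found: list[str] = []
    for node in tree.body:  # only module-level statements, not nested ones
        if isinstance(node, ast.Import):
            found.extend(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module:
            found.append(node.module)
    return [m for m in found if m.split(".")[0] not in ALLOWED_ROOTS]

if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else "vllm/__init__.py"
    offenders = module_level_imports(target)
    if offenders:
        print("Non-lazy root imports found:", ", ".join(offenders))
        sys.exit(1)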

CMakeLists.txt

Lines changed: 3 additions & 3 deletions

@@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
+  # require CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"

README.md

Lines changed: 3 additions & 1 deletion

@@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Contact Us
 
+<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
-- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [[email protected]](mailto:[email protected])
+<!-- --8<-- [end:contact-us] -->
 
 ## Media Kit
 

benchmarks/backend_request_func.py

Lines changed: 7 additions & 1 deletion

@@ -404,8 +404,14 @@ async def async_request_openai_chat_completions(
                     chunk_bytes = chunk_bytes.strip()
                     if not chunk_bytes:
                         continue
+                    chunk_bytes = chunk_bytes.decode("utf-8")
+                    # NOTE: SSE comments (often used as pings) start with a colon.
+                    # These are not JSON data payload and should be skipped.
+                    if chunk_bytes.startswith(":"):
+                        continue
+
+                    chunk = chunk_bytes.removeprefix("data: ")
 
-                    chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                     if chunk != "[DONE]":
                         timestamp = time.perf_counter()
                         data = json.loads(chunk)
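The new check relies on a detail of Server-Sent Events framing: a line beginning with a colon is a comment, commonly emitted as a keep-alive ping, and carries no JSON payload, so it must be skipped before stripping the "data: " prefix and parsing. A minimal, self-contained sketch of the same parsing logic (the helper name is illustrative, not the benchmark's own):

import json

def parse_sse_line(raw: bytes):
    line = raw.strip().decode("utf-8")
    if not line or line.startswith(":"):
        return None                      # blank line or comment/ping: skip
    payload = line.removeprefix("data: ")
    if payload == "[DONE]":
        return None                      # end-of-stream marker
    return json.loads(payload)

print(parse_sse_line(b": ping - keep-alive"))      # None
print(parse_sse_line(b'data: {"choices": []}'))    # {'choices': []}
print(parse_sse_line(b"data: [DONE]"))             # None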

benchmarks/benchmark_dataset.py

Lines changed: 1 addition & 1 deletion

@@ -353,7 +353,7 @@ def sample(
                 : input_lens[i]
             ]
             prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = prefix_len + int(input_lens[i])
+            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
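The reason len(re_encoded_sequence) is the more accurate measure is that decoding a slice of token ids and re-encoding the resulting text does not always round-trip to the same number of tokens, so prefix_len + int(input_lens[i]) can drift from the length of the prompt actually sent. A small sketch of that drift, assuming a Hugging Face tokenizer (the model name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer

ids = tokenizer("The quick brown fox jumps over the lazy dog").input_ids
sliced = ids[:5]                                   # stand-in for the sampled prompt ids
re_encoded = tokenizer(tokenizer.decode(sliced)).input_ids

# The two lengths can differ because token boundaries may merge or split
# across the decode/encode round trip; the re-encoded length is what the
# server actually receives.
print(len(sliced), len(re_encoded))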

benchmarks/benchmark_throughput.py

Lines changed: 1 addition & 1 deletion

@@ -97,7 +97,7 @@ def run_vllm(
         assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
-        output_len = requests[0][2]
+        output_len = requests[0].expected_output_len
         for request in requests:
            assert request.expected_output_len == output_len
         start = time.perf_counter()
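The old expression requests[0][2] indexed the request as if it were a tuple; the benchmark's requests are SampleRequest objects, so the expected output length has to be read as an attribute. A minimal sketch with a trimmed-down, hypothetical stand-in for that class (the real dataclass has more fields):

from dataclasses import dataclass

@dataclass
class SampleRequest:          # illustrative stand-in, not the benchmark's full class
    prompt: str
    prompt_len: int
    expected_output_len: int

requests = [SampleRequest(prompt="Hello", prompt_len=1, expected_output_len=128)]

# requests[0][2] would raise TypeError: 'SampleRequest' object is not subscriptable
output_len = requests[0].expected_output_len
print(output_len)  # 128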

New file

Lines changed: 159 additions & 0 deletions

@@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools

import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
    moe_align_block_size_triton,
)
from vllm.triton_utils import triton


def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
    return torch.stack(
        [
            torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
            for _ in range(num_tokens)
        ]
    )


def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
    """
    Verifies vllm vs. Triton
    """
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

    # 1. malloc space for triton and vllm
    # malloc enough space (max_num_tokens_padded) for the sorted ids
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids_triton = torch.empty(
        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
    )
    sorted_ids_triton.fill_(topk_ids.numel())  # fill with sentinel value
    expert_ids_triton = torch.zeros(
        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
    )
    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")

    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
    sorted_ids_vllm.fill_(topk_ids.numel())
    expert_ids_vllm = torch.zeros_like(expert_ids_triton)
    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)

    # 2. run implementations
    moe_align_block_size_triton(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids_triton,
        expert_ids_triton,
        num_tokens_post_pad_triton,
    )

    ops.moe_align_block_size(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids_vllm,
        expert_ids_vllm,
        num_tokens_post_pad_vllm,
    )
    print(f"✅ VLLM implementation works with {num_experts} experts!")

    # 3. compare results
    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
    ):
        print("✅ Triton and VLLM implementations match.")
    else:
        print("❌ Triton and VLLM implementations DO NOT match.")
        print("Triton expert_ids:", expert_ids_triton)
        print("VLLM expert_ids:", expert_ids_vllm)
        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)


# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens", "num_experts", "topk"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["vllm", "triton"],  # "triton"
        line_names=["VLLM", "Triton"],  # "Triton"
        plot_name="moe-align-block-size-performance",
        args={},
    )
)
def benchmark(num_tokens, num_experts, topk, provider):
    """Benchmark function for Triton."""
    block_size = 256
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
    sorted_ids.fill_(topk_ids.numel())
    max_num_m_blocks = max_num_tokens_padded // block_size
    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")

    quantiles = [0.5, 0.2, 0.8]

    if provider == "vllm":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: ops.moe_align_block_size(
                topk_ids,
                num_experts,
                block_size,
                sorted_ids.clone(),
                expert_ids.clone(),
                num_tokens_post_pad.clone(),
            ),
            quantiles=quantiles,
        )
    elif provider == "triton":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: moe_align_block_size_triton(
                topk_ids,
                num_experts,
                block_size,
                sorted_ids.clone(),
                expert_ids.clone(),
                num_tokens_post_pad.clone(),
            ),
            quantiles=quantiles,
        )

    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num_experts",
        type=int,
        default=64,
        choices=[8, 16, 32, 64, 128, 256],
    )
    parser.add_argument(
        "--topk",
        type=int,
        default=8,
        choices=[2, 4, 8],
        help="Top-k value for correctness check.",
    )
    args = parser.parse_args()

    print("Running correctness check...")
    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
    benchmark.run(print_data=True, show_plots=True)
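For context on what is being benchmarked: moe_align_block_size groups the flattened (token, expert) assignments by expert and pads each expert's group up to a multiple of block_size, producing the grouped slot indices, one expert id per block, and the padded total. The following is a simplified, hypothetical pure-PyTorch reference of that layout, written only to illustrate the outputs the script compares; the real CUDA and Triton kernels preallocate max_num_tokens_padded entries and may differ in edge cases.

import torch

def moe_align_block_size_ref(topk_ids: torch.Tensor, num_experts: int, block_size: int):
    """Illustrative reference: group token slots by expert, pad each group to block_size."""
    flat = topk_ids.flatten()
    sentinel = flat.numel()  # same padding value the benchmark fills in above
    sorted_chunks, expert_ids = [], []
    for expert in range(num_experts):
        slots = torch.nonzero(flat == expert, as_tuple=False).flatten().to(torch.int32)
        padded_len = -(-slots.numel() // block_size) * block_size  # ceil to a block multiple
        chunk = torch.full((padded_len,), sentinel, dtype=torch.int32)
        chunk[: slots.numel()] = slots
        sorted_chunks.append(chunk)
        expert_ids.extend([expert] * (padded_len // block_size))
    sorted_ids = torch.cat(sorted_chunks)
    num_tokens_post_pad = torch.tensor([sorted_ids.numel()], dtype=torch.int32)
    return sorted_ids, torch.tensor(expert_ids, dtype=torch.int32), num_tokens_post_pad

# Tiny CPU example: 4 tokens, top-2 routing over 4 experts, block_size=4.
topk_ids = torch.tensor([[0, 1], [1, 2], [0, 3], [2, 3]], dtype=torch.int32)
sorted_ids, expert_ids, total = moe_align_block_size_ref(topk_ids, num_experts=4, block_size=4)
print(sorted_ids, expert_ids, total)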
