Commit 166d0ef

Merge pull request ROCm#538 from ROCm/upstream_merge_2025_05_06
Upstream merge 2025 05 06
2 parents 2fea69f + a0b4ef2 commit 166d0ef

213 files changed: +7786 −4829 lines changed

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
+      - "yes | docker system prune -a"
       - "git fetch --all"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ steps:
   - pip install -r ../../requirements/docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/inference_params.html
+  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   source_file_dependencies:

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ instance/
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/
+docs/source/api/vllm
 
 # PyBuilder
 .pybuilder/

.pre-commit-config.yaml

Lines changed: 3 additions & 5 deletions
@@ -46,7 +46,7 @@ repos:
   rev: 0.6.17
   hooks:
     - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
       files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
@@ -101,8 +101,8 @@ repos:
       args:
        - -c
        - |
-          if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
-            printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+          if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
+            printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
          fi
       language: system
       verbose: true
@@ -125,8 +125,6 @@ repos:
       name: Update Dockerfile dependency graph
       entry: tools/update-dockerfile-graph.sh
       language: script
-      files: ^docker/Dockerfile$
-      pass_filenames: false
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion

CMakeLists.txt

Lines changed: 62 additions & 7 deletions
@@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
 
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
-
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 
@@ -251,9 +250,8 @@ set(VLLM_EXT_SRC
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
-  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  # Please keep this in sync with FetchContent_Declare line below.
-  set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
+  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -271,7 +269,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
         # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG v3.9.0
+        GIT_TAG ${CUTLASS_REVISION}
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -304,8 +302,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
   cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
+
+    #
+    # For the Marlin kernels we automatically generate sources for various
+    # preselected input type pairs and schedules.
+    # Generate sources:
+    set(MARLIN_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
+    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=$PYTHONPATH
+        ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+        RESULT_VARIABLE marlin_generation_result
+        OUTPUT_VARIABLE marlin_generation_result
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+      )
+
+      if (NOT marlin_generation_result EQUAL 0)
+        message(FATAL_ERROR "Marlin generation failed."
+                " Result: \"${marlin_generation_result}\""
+                "\nCheck the log for details: "
+                "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
+      else()
+        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run Marlin generate script hash" FORCE)
+        message(STATUS "Marlin generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Marlin generation script has not changed, skipping generation.")
+    endif()
+
+    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_ARCHS}")
+
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
     set(MARLIN_SRCS
-        "csrc/quantization/fp8/fp8_marlin.cu"
         "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
         "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
         "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -647,7 +689,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
     execute_process(
       COMMAND ${CMAKE_COMMAND} -E env
-      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+      PYTHONPATH=$PYTHONPATH
      ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
      RESULT_VARIABLE moe_marlin_generation_result
      OUTPUT_VARIABLE moe_marlin_generation_output
@@ -683,6 +725,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 endif()
 
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(MOE_PERMUTE_SRC
+      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+      "csrc/moe/moe_permute_unpermute_op.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_PERMUTE_SRC}"
+    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
+
+  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
@@ -691,6 +744,8 @@ define_gpu_extension_target(
   SOURCES ${VLLM_MOE_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)

benchmarks/benchmark_dataset.py

Lines changed: 15 additions & 2 deletions
@@ -315,13 +315,15 @@ def sample(
         )
 
         vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
         # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
         output_high = int(output_len * (1 + range_ratio))
 
@@ -344,6 +346,17 @@ def sample(
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            # For example for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decode again.
+            re_encoded_sequence = tokenizer.encode(
+                prompt, add_special_tokens=False)[:input_lens[i]]
+            prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(
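
The re-encode step added above guards against tokenizer round-trip drift: decoding N random token ids and re-encoding the result does not always give back N tokens, so the prompt length would otherwise change in an uncontrolled way. A minimal standalone sketch of the same idea (assuming the Hugging Face transformers GPT-2 tokenizer; everything here is illustrative and not part of the commit):

    # Sketch only: shows why benchmark_dataset.py re-encodes and truncates the
    # decoded prompt. Assumes `transformers` is installed; GPT-2 is an example.
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    target_len = 32

    # Random token ids do not always round-trip to the same number of tokens.
    token_ids = np.random.randint(0, tokenizer.vocab_size, size=target_len).tolist()
    prompt = tokenizer.decode(token_ids)
    round_trip = tokenizer.encode(prompt, add_special_tokens=False)
    print(len(token_ids), len(round_trip))  # lengths may differ, e.g. 32 vs 33

    # Truncate the re-encoded ids and decode once more so the prompt length
    # stays controlled, mirroring the change above.
    prompt = tokenizer.decode(round_trip[:target_len])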

benchmarks/benchmark_serving_structured_output.py

Lines changed: 0 additions & 12 deletions
@@ -414,7 +414,6 @@ async def benchmark(
     ignore_eos: bool,
     max_concurrency: Optional[int],
     structured_output_ratio: float,
-    structured_output_backend: str,
     goodput_config_dict: Optional[dict[str, float]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
@@ -426,8 +425,6 @@ def prepare_extra_body(request) -> dict:
         extra_body = {}
         # Add the schema to the extra_body
         extra_body[request.structure_type] = request.schema
-        # Add the specific structured_output_backend
-        extra_body["guided_decoding_backend"] = structured_output_backend
         return extra_body
 
     print("Starting initial single prompt test run...")
@@ -785,7 +782,6 @@ def main(args: argparse.Namespace):
             ignore_eos=args.ignore_eos,
             max_concurrency=args.max_concurrency,
             structured_output_ratio=args.structured_output_ratio,
-            structured_output_backend=args.structured_output_backend,
             goodput_config_dict=goodput_config_dict,
         ))
 
@@ -1000,14 +996,6 @@ def main(args: argparse.Namespace):
         type=float,
         default=1.0,
         help="Ratio of Structured Outputs requests")
-    parser.add_argument("--structured-output-backend",
-                        type=str,
-                        choices=[
-                            "outlines", "lm-format-enforcer", "xgrammar",
-                            "guidance", "auto"
-                        ],
-                        default="auto",
-                        help="Backend to use for structured outputs")
 
     args = parser.parse_args()
     main(args)

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 2 additions & 1 deletion
@@ -90,7 +90,8 @@ def bench_run(results: list[benchmark.Measurement], model: str,
 
     score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
 
-    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids, token_expert_indices = fused_topk(
+        a, score, topk, renormalize=False)
 
     def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
                        topk_weights: torch.Tensor, topk_ids: torch.Tensor,

benchmarks/kernels/benchmark_moe.py

Lines changed: 20 additions & 5 deletions
@@ -10,12 +10,12 @@
 
 import ray
 import torch
-import triton
 from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig
 
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
+from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -115,8 +115,8 @@ def run():
         from vllm.model_executor.layers.fused_moe import override_config
         with override_config(config):
             if use_deep_gemm:
-                topk_weights, topk_ids = fused_topk(x, input_gating, topk,
-                                                    False)
+                topk_weights, topk_ids, token_expert_indices = fused_topk(
+                    x, input_gating, topk, False)
                 return fused_experts(
                     x,
                     w1,
@@ -442,8 +442,14 @@ def tune(
                                hidden_size, search_space,
                                is_fp16, topk)
 
-        with torch.cuda.device(self.device_id) if current_platform.is_rocm(
-        ) else nullcontext():
+        need_device_guard = False
+        if current_platform.is_rocm():
+            visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
+            if visible_device != f"{self.device_id}":
+                need_device_guard = True
+
+        with torch.cuda.device(
+                self.device_id) if need_device_guard else nullcontext():
             for config in tqdm(search_space):
                 try:
                     kernel_time = benchmark_config(
@@ -578,6 +584,15 @@ def main(args: argparse.Namespace):
 
     use_deep_gemm = bool(args.use_deep_gemm)
 
+    if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
+        # Ray will set ROCR_VISIBLE_DEVICES for device visibility
+        logger.warning(
+            "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
+            "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
+        val = os.environ["HIP_VISIBLE_DEVICES"]
+        os.environ["ROCR_VISIBLE_DEVICES"] = val
+        del os.environ["HIP_VISIBLE_DEVICES"]
+
     ray.init()
     num_gpus = int(ray.available_resources()["GPU"])
     workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
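
The last hunk rests on the commit's own note that Ray manages device visibility on ROCm through ROCR_VISIBLE_DEVICES rather than HIP_VISIBLE_DEVICES. A small sketch of that assumption, checking what a Ray worker actually sees after the same environment-variable swap (the helper name and layout are illustrative and not from the diff):

    # Sketch only: verifies which visibility variable a Ray worker sees on a
    # ROCm host after swapping HIP_VISIBLE_DEVICES for ROCR_VISIBLE_DEVICES,
    # as the hunk above does before ray.init(). Requires ray and >= 1 GPU.
    import os
    import ray

    # Mirror the change in benchmark_moe.py: hand the device list to the
    # variable Ray manages (per the commit's comment) before Ray starts.
    if "HIP_VISIBLE_DEVICES" in os.environ:
        os.environ["ROCR_VISIBLE_DEVICES"] = os.environ.pop("HIP_VISIBLE_DEVICES")

    ray.init()

    @ray.remote(num_gpus=1)
    def visible_devices() -> dict:
        # Each worker reports the visibility variables Ray exposed to it.
        return {
            "ROCR_VISIBLE_DEVICES": os.environ.get("ROCR_VISIBLE_DEVICES"),
            "HIP_VISIBLE_DEVICES": os.environ.get("HIP_VISIBLE_DEVICES"),
        }

    print(ray.get(visible_devices.remote()))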
