Commit ce342c7

Merge remote-tracking branch 'upstream/main' into upstream_merge_25_02_17

2 parents: b96c11c + ce77eb9

296 files changed, with 15487 additions and 5432 deletions.

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 6 additions & 0 deletions

```diff
@@ -70,6 +70,12 @@ steps:
   #key: block-h100
   #depends_on: ~
 
+  - label: "Cleanup H100"
+    agents:
+      queue: H100
+    depends_on: ~
+    command: docker system prune -a --volumes --force
+
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
```
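The new cleanup step frees disk space on the shared H100 runner before the benchmark job runs. A sketch of what it does, plus a preview command that is my addition rather than part of the pipeline:

```bash
# Optional preview: how much space images, containers, and volumes hold.
docker system df

# The pipeline step itself: remove ALL unused images (-a, not just
# dangling ones), stopped containers, networks, and volumes, skipping
# the confirmation prompt (--force).
docker system prune -a --volumes --force
```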

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 5 additions & 0 deletions

```diff
@@ -345,6 +345,11 @@ main() {
   check_gpus
   check_hf_token
 
+  # Set to v1 to run v1 benchmark
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
```
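With this guard, the benchmark engine is selected by a single environment variable: anything other than `v1` (including unset, which defaults to `v0`) leaves `VLLM_USE_V1` unexported. A usage sketch, assuming the script is invoked from the repo root as in the nightly pipeline:

```bash
# Default run on the v0 engine:
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

# Same run against the v1 engine (exports VLLM_USE_V1=1 internally):
ENGINE_VERSION=v1 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```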

.buildkite/nightly-benchmarks/tests/latency-tests.json

Lines changed: 1 addition & 1 deletion (whitespace only: the closing `]` is rewritten, most likely to add the missing newline at end of file)

```diff
@@ -29,4 +29,4 @@
             "num-iters": 15
         }
     }
-]
+]
```

.buildkite/test-pipeline.yaml

Lines changed: 20 additions & 4 deletions

```diff
@@ -107,13 +107,17 @@ steps:
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  - tests/entrypoints/offline_mode
   commands:
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -124,9 +128,10 @@ steps:
   source_file_dependencies:
   - vllm/distributed/
   - vllm/core/
-  - tests/distributed
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
   - tests/spec_decode/e2e/test_integration_dist_tp4
-  - tests/compile
+  - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   commands:
@@ -174,6 +179,9 @@ steps:
   - vllm/
   - tests/engine
   - tests/tokenization
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py
   # OOM in the CI unless we run this separately
@@ -197,7 +205,7 @@ steps:
   - VLLM_USE_V1=1 pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
-  - pytest -v -s entrypoints/openai/test_accuracy.py::test_lm_eval_accuracy_v1_engine
+  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: Examples Test # 25min
   working_dir: "/vllm-workspace/examples"
@@ -331,6 +339,14 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
+- label: OpenAI API correctness
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
+
 - label: Encoder Decoder tests # 5min
   source_file_dependencies:
   - vllm/
```
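Taken together, these hunks split the slow accuracy checks into a dedicated `entrypoints/openai/correctness/` suite (LM-Eval plus the Whisper transcription WER check) and exclude it from the general entrypoints run. To approximate the new step locally, assuming the pipeline's `/vllm-workspace/tests` working directory:

```bash
# The dedicated correctness step, as the new pipeline label runs it:
pytest -s entrypoints/openai/correctness/

# The relocated LM-Eval test can still be targeted on its own:
pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
```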

.github/workflows/cleanup_pr_body.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -16,7 +16,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
         with:
           python-version: '3.12'
```
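This bump (and the matching one in pre-commit.yml below) keeps the repo's convention of pinning an action to a full commit SHA, with the human-readable tag in a trailing comment. A hedged way to check that a pinned SHA matches the tag it claims (for annotated tags the commit appears on the peeled `^{}` line):

```bash
# List the v5.4.0 tag ref for actions/setup-python; for an annotated tag,
# a second "^{}" line shows the commit it points at.
git ls-remote --tags https://github.com/actions/setup-python v5.4.0
# Expect 42375524e23c412d93fb67b49958b491fce71c38 in the output.
```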

.github/workflows/pre-commit.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -10,10 +10,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
       with:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
         extra_args: --all-files --hook-stage manual
```
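The added matcher registers mypy's output format with GitHub's problem matchers, so type errors surface as inline annotations just like actionlint findings. The same hook run can be reproduced locally with the workflow's `extra_args`:

```bash
pip install pre-commit
# Mirrors what pre-commit/action executes in CI:
pre-commit run --all-files --hook-stage manual
```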

.github/workflows/stale.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
```

.pre-commit-config.yaml

Lines changed: 12 additions & 9 deletions

```diff
@@ -13,13 +13,14 @@ repos:
   rev: v0.9.3
   hooks:
   - id: ruff
-    args: [--output-format, github]
+    args: [--output-format, github, --fix]
     exclude: 'vllm/third_party/.*'
 - repo: https://github.com/codespell-project/codespell
   rev: v2.4.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*|vllm/third_party/.*'
+    additional_dependencies: ['tomli']
+    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks:
@@ -116,13 +117,6 @@ repos:
     language: python
     types: [python]
     exclude: 'vllm/third_party/.*'
-  - id: suggestion
-    name: Suggestion
-    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
-    language: system
-    verbose: true
-    pass_filenames: false
-    exclude: 'vllm/third_party/.*'
   - id: check-filenames
     name: Check for spaces in all filenames
     entry: bash
@@ -133,3 +127,12 @@ repos:
     always_run: true
     pass_filenames: false
     exclude: 'vllm/third_party/.*'
+  # Keep `suggestion` last
+  - id: suggestion
+    name: Suggestion
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+    language: system
+    verbose: true
+    pass_filenames: false
+    exclude: 'vllm/third_party/.*'
+  # Insert new entries above the `suggestion` entry
```
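The codespell hook now reads its configuration from pyproject.toml instead of carrying a long inline exclude regex; `tomli` is the TOML parser codespell needs on Pythons without a built-in one. Presumably the skip and ignore lists moved into a `[tool.codespell]` table in pyproject.toml (not shown in this commit view). The equivalent manual invocation, as a sketch:

```bash
pip install codespell tomli
# Pick up skip/ignore settings from pyproject.toml, as the hook now does:
codespell --toml pyproject.toml
```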

CMakeLists.txt

Lines changed: 23 additions & 5 deletions

```diff
@@ -262,7 +262,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
+  # Please keep this in sync with FetchContent_Declare line below.
+  set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -279,6 +280,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+        # Please keep this in sync with CUTLASS_REVISION line above.
         GIT_TAG v3.7.0
         GIT_PROGRESS TRUE
 
@@ -298,8 +300,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
     "csrc/cutlass_extensions/common.cpp")
 
   set_gencode_flags_for_srcs(
@@ -392,8 +394,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
   # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
-             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -411,6 +412,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # FP4 Archs and flags
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+    set(SRCS
+      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${FP4_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+  else()
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
+    # clear FP4_ARCHS
+    set(FP4_ARCHS)
+  endif()
 
   #
   # Machete kernels
@@ -497,7 +515,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)
```
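The new NVFP4 block is doubly gated: nvcc must be newer than 12.8, and the Blackwell arch `10.0a` must survive `cuda_archs_loose_intersection` with the requested `CUDA_ARCHS`; otherwise the build prints the fallback message and clears `FP4_ARCHS`. A minimal sketch of a source build that would exercise the new branch, assuming vLLM's usual flow where `TORCH_CUDA_ARCH_LIST` drives the CMake arch list:

```bash
# Assumption: a CUDA 12.8 toolkit is installed and visible to the build.
export TORCH_CUDA_ARCH_LIST="9.0a;10.0a"
pip install -e . --no-build-isolation
# The CMake configure log should then contain one of:
#   -- Building NVFP4 for archs: 10.0a
#   -- Not building NVFP4 as no compatible archs were found.
```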

Dockerfile

Lines changed: 5 additions & 2 deletions

```diff
@@ -195,19 +195,22 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
-# How to build this FlashInfer wheel:
+# If we need to build FlashInfer wheel before its release:
 # $ export FLASHINFER_ENABLE_AOT=1
 # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
 # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
 # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
 # $ cd flashinfer
 # $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
+# $ rm -rf build
 # $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
+# $ ls dist
+# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+        python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
     fi
 COPY examples examples
```
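Note the wheel-tag change in the install line: the old `wheels.vllm.ai` wheel was interpreter-specific (`cp${PYTHON_VERSION_STR}`), while the new GitHub release wheel is tagged `cp38-abi3`, built against the stable ABI so a single artifact serves CPython 3.8+. To confirm a given environment accepts abi3 wheels (a hedged check, not part of the Dockerfile):

```bash
# "pip debug" lists the tags this pip will accept; abi3 tags such as
# cp38-abi3-manylinux... should appear on CPython >= 3.8.
python3 -m pip debug --verbose | grep abi3
```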
