
Commit 7c2e05c

Merge branch 'main' into feat/qwen3-h100-moe-configs

2 parents: 14699ba + 6c04638

109 files changed: +567, -649 lines


.buildkite/test-pipeline.yaml

Lines changed: 12 additions & 2 deletions

@@ -835,11 +835,11 @@ steps:
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

-- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py
@@ -866,6 +866,16 @@ steps:
   commands:
   - pytest -s -v tests/quantization/test_blackwell_moe.py

+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 75
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
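For local debugging, the GSM8K eval that the new nightly step runs can be invoked directly with pytest. A minimal sketch, assuming a b200 machine and the same checkout the pipeline uses; the flags and paths are taken verbatim from the diff, while the Python runner around them is illustrative:

import subprocess

# Mirrors the pipeline command in the new "Blackwell LM Eval Small Models"
# step above; run from the directory the CI step uses as its working dir.
subprocess.run(
    [
        "pytest", "-s", "-v",
        "evals/gsm8k/test_gsm8k_correctness.py",
        "--config-list-file=configs/models-blackwell.txt",
        "--tp-size=1",
    ],
    check=True,
)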

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

benchmarks/benchmark_ngram_proposer.py

Lines changed: 1 addition & 1 deletion

@@ -164,7 +164,7 @@ def invoke_main() -> None:
     )
     parser.add_argument(
         "--batched", action="store_true", help="consider time to prepare batch"
-    )  # noqa: E501
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,

benchmarks/benchmark_serving_structured_output.py

Lines changed: 2 additions & 2 deletions

@@ -909,13 +909,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--num-prompts",

csrc/cutlass_extensions/vllm_cutlass_library_extension.py

Lines changed: 3 additions & 3 deletions

@@ -72,8 +72,8 @@ class MixedInputKernelScheduleType(enum.Enum):
 ] = {
     **KernelScheduleTag,  # type: ignore
     **{
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
+        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
     },
 }

docs/deployment/integrations/production-stack.md

Lines changed: 2 additions & 2 deletions

@@ -55,7 +55,7 @@ sudo kubectl port-forward svc/vllm-router-service 30080:80
 And then you can send out a query to the OpenAI-compatible API to check the available models:

 ```bash
-curl -o- http://localhost:30080/models
+curl -o- http://localhost:30080/v1/models
 ```

 ??? console "Output"
@@ -78,7 +78,7 @@ curl -o- http://localhost:30080/models
 To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint:

 ```bash
-curl -X POST http://localhost:30080/completions \
+curl -X POST http://localhost:30080/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
     "model": "facebook/opt-125m",

docs/design/plugin_system.md

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ Every plugin has three parts:

 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.

-- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name.
+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.

 ## Guidelines for Writing Plugins
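As a companion to the corrected bullet, a hypothetical IO Processor plugin function; only the entry-point group name and the return-value convention come from the doc, while the package, module, and class names are invented for illustration:

# Hypothetical module my_pkg/plugins.py, registered under the
# "vllm.io_processor_plugins" entry-point group in the package metadata.
def register_io_processor() -> str:
    # Per the doc: the plugin function returns the fully qualified
    # class name of the IOProcessor implementation, which vLLM resolves.
    return "my_pkg.processors.CaptionIOProcessor"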

examples/offline_inference/vision_language_pooling.py

Lines changed: 1 addition & 1 deletion

@@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
 def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
-        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        prompt = f"Find me an everyday image that matches the given caption: {text}"
         image = None
     elif query["modality"] == "image":
         prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501

examples/online_serving/disaggregated_serving/disagg_proxy_demo.py

Lines changed: 2 additions & 2 deletions

@@ -203,9 +203,9 @@ async def forward_request(self, url, data, use_chunked=True):
             async with session.post(
                 url=url, json=data, headers=headers
             ) as response:
-                if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
+                if 200 <= response.status < 300 or 400 <= response.status < 500:
                     if use_chunked:
-                        async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
+                        async for chunk_bytes in response.content.iter_chunked(
                             1024
                         ):
                             yield chunk_bytes
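Outside the proxy class, the same forwarding pattern can be exercised standalone. A minimal sketch using aiohttp's ClientSession and iter_chunked APIs; the function name is illustrative and the headers handling is simplified:

import aiohttp

async def forward(url: str, data: dict):
    async with aiohttp.ClientSession() as session:
        async with session.post(url=url, json=data) as response:
            # Stream success (2xx) and client-error (4xx) bodies back to
            # the caller in 1 KiB chunks, as in the diff above.
            if 200 <= response.status < 300 or 400 <= response.status < 500:
                async for chunk_bytes in response.content.iter_chunked(1024):
                    yield chunk_bytes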

pyproject.toml

Lines changed: 0 additions & 80 deletions

@@ -56,86 +56,6 @@ include = ["vllm*"]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# TEMPORARY! These ignores will be fixed forward
-## Line length violations
-"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
-"tests/compile/piecewise/test_simple.py" = ["E501"]
-"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
-"tests/entrypoints/conftest.py" = ["E501"]
-"tests/entrypoints/openai/test_audio.py" = ["E501"]
-"tests/entrypoints/openai/test_chat.py" = ["E501"]
-"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
-"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
-"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
-"tests/entrypoints/openai/test_video.py" = ["E501"]
-"tests/entrypoints/openai/test_vision.py" = ["E501"]
-"tests/entrypoints/test_chat_utils.py" = ["E501"]
-"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
-"tests/models/language/generation/test_gemma.py" = ["E501"]
-"tests/models/language/generation/test_mistral.py" = ["E501"]
-"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
-"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
-"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
-"tests/tool_use/test_tool_choice_required.py" = ["E501"]
-"tests/v1/attention/utils.py" = ["E501"]
-"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
-"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
-"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
-"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
-"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
-"vllm/compilation/collective_fusion.py" = ["E501"]
-"vllm/compilation/wrapper.py" = ["E501"]
-"vllm/config/vllm.py" = ["E501"]
-"vllm/distributed/device_communicators/all2all.py" = ["E501"]
-"vllm/entrypoints/openai/protocol.py" = ["E501"]
-"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
-"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
-"vllm/model_executor/models/bailing_moe.py" = ["E501"]
-"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
-"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
-"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
-"vllm/model_executor/models/phi4mm.py" = ["E501"]
-"vllm/model_executor/models/qwen3_next.py" = ["E501"]
-"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
-"vllm/v1/attention/backends/mla/common.py" = ["E501"]
-"vllm/v1/engine/utils.py" = ["E501"]
-"vllm/v1/utils.py" = ["E501"]
-"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
-## Simplification rules
-"tests/distributed/test_expert_placement.py" = ["SIM108"]
-"tests/kernels/attention/test_cutlass_mla_decode.py" = ["SIM108"]
-"tests/kernels/attention/test_flashmla.py" = ["SIM108"]
-"tests/kernels/attention/test_lightning_attn.py" = ["SIM108"]
-"tests/kernels/moe/test_pplx_moe.py" = ["SIM108"]
-"tests/kernels/quantization/test_cutlass_scaled_mm.py" = ["SIM108"]
-"tests/kernels/test_onednn.py" = ["SIM108"]
-"tests/kernels/utils.py" = ["SIM108"]
-"tests/multimodal/test_processing.py" = ["SIM108"]
-"vllm/attention/ops/triton_reshape_and_cache_flash.py" = ["SIM108"]
-"vllm/distributed/parallel_state.py" = ["SIM108"]
-"vllm/entrypoints/chat_utils.py" = ["SIM108"]
-"vllm/entrypoints/llm.py" = ["SIM108"]
-"vllm/executor/ray_distributed_executor.py" = ["SIM108", "SIM112"]
-"vllm/model_executor/layers/batch_invariant.py" = ["SIM108"]
-"vllm/model_executor/layers/fla/ops/chunk_o.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/fused_moe.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/layer.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/modular_kernel.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py" = ["SIM108"]
-"vllm/model_executor/layers/layernorm.py" = ["SIM108"]
-"vllm/model_executor/layers/lightning_attn.py" = ["SIM108"]
-"vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py" = ["SIM103"]
-"vllm/model_executor/layers/quantization/compressed_tensors/utils.py" = ["SIM110"]
-"vllm/model_executor/layers/quantization/quark/utils.py" = ["SIM110"]
-"vllm/utils/__init__.py" = ["SIM108"]
-"vllm/v1/sample/ops/bad_words.py" = ["SIM108"]
-"vllm/v1/sample/rejection_sampler.py" = ["SIM108"]
-"vllm/v1/worker/tpu_model_runner.py" = ["SIM108"]
-"vllm/_custom_ops.py" = ["SIM108"]
-"tools/profiler/print_layerwise_table.py" = ["SIM118"]
-## Loop variable binding issues
-"tests/kernels/mamba/test_mamba_ssm_ssd.py" = ["B023"]
-# End of temporary ignores

 [tool.ruff.lint]
 select = [
