
Commit 7c2e05c

Merge branch 'main' into feat/qwen3-h100-moe-configs

2 parents: 14699ba + 6c04638

109 files changed: +567, -649 lines


.buildkite/test-pipeline.yaml

Lines changed: 12 additions & 2 deletions

@@ -835,11 +835,11 @@ steps:
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

-- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py
@@ -866,6 +866,16 @@ steps:
   commands:
   - pytest -s -v tests/quantization/test_blackwell_moe.py

+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 75
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
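For local debugging, the GSM8K eval that the new nightly step runs can be invoked directly with pytest. A minimal sketch, assuming a b200 machine and the same checkout the pipeline uses; the flags and paths are taken verbatim from the diff, while the Python runner around them is illustrative:

import subprocess

# Mirrors the pipeline command in the new "Blackwell LM Eval Small Models"
# step above; run from the directory the CI step uses as its working dir.
subprocess.run(
    [
        "pytest", "-s", "-v",
        "evals/gsm8k/test_gsm8k_correctness.py",
        "--config-list-file=configs/models-blackwell.txt",
        "--tp-size=1",
    ],
    check=True,
)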

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

benchmarks/benchmark_ngram_proposer.py

Lines changed: 1 addition & 1 deletion

@@ -164,7 +164,7 @@ def invoke_main() -> None:
     )
     parser.add_argument(
         "--batched", action="store_true", help="consider time to prepare batch"
-    )  # noqa: E501
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,

benchmarks/benchmark_serving_structured_output.py

Lines changed: 2 additions & 2 deletions

@@ -909,13 +909,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--num-prompts",

csrc/cutlass_extensions/vllm_cutlass_library_extension.py

Lines changed: 3 additions & 3 deletions

@@ -72,8 +72,8 @@ class MixedInputKernelScheduleType(enum.Enum):
 ] = {
     **KernelScheduleTag,  # type: ignore
     **{
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
+        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
     },
 }

docs/deployment/integrations/production-stack.md

Lines changed: 2 additions & 2 deletions

@@ -55,7 +55,7 @@ sudo kubectl port-forward svc/vllm-router-service 30080:80
 And then you can send out a query to the OpenAI-compatible API to check the available models:

 ```bash
-curl -o- http://localhost:30080/models
+curl -o- http://localhost:30080/v1/models
 ```

 ??? console "Output"
@@ -78,7 +78,7 @@ curl -o- http://localhost:30080/models
 To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint:

 ```bash
-curl -X POST http://localhost:30080/completions \
+curl -X POST http://localhost:30080/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
     "model": "facebook/opt-125m",

docs/design/plugin_system.md

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ Every plugin has three parts:

 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.

-- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name.
+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.

 ## Guidelines for Writing Plugins
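As a companion to the corrected bullet, a hypothetical IO Processor plugin function; only the entry-point group name and the return-value convention come from the doc, while the package, module, and class names are invented for illustration:

# Hypothetical module my_pkg/plugins.py, registered under the
# "vllm.io_processor_plugins" entry-point group in the package metadata.
def register_io_processor() -> str:
    # Per the doc: the plugin function returns the fully qualified
    # class name of the IOProcessor implementation, which vLLM resolves.
    return "my_pkg.processors.CaptionIOProcessor"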

examples/offline_inference/vision_language_pooling.py

Lines changed: 1 addition & 1 deletion

@@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
 def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
-        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        prompt = f"Find me an everyday image that matches the given caption: {text}"
         image = None
     elif query["modality"] == "image":
         prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501

examples/online_serving/disaggregated_serving/disagg_proxy_demo.py

Lines changed: 2 additions & 2 deletions

@@ -203,9 +203,9 @@ async def forward_request(self, url, data, use_chunked=True):
             async with session.post(
                 url=url, json=data, headers=headers
             ) as response:
-                if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
+                if 200 <= response.status < 300 or 400 <= response.status < 500:
                     if use_chunked:
-                        async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
+                        async for chunk_bytes in response.content.iter_chunked(
                             1024
                         ):
                             yield chunk_bytes
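Outside the proxy class, the same forwarding pattern can be exercised standalone. A minimal sketch using aiohttp's ClientSession and iter_chunked APIs; the function name is illustrative and the headers handling is simplified:

import aiohttp

async def forward(url: str, data: dict):
    async with aiohttp.ClientSession() as session:
        async with session.post(url=url, json=data) as response:
            # Stream success (2xx) and client-error (4xx) bodies back to
            # the caller in 1 KiB chunks, as in the diff above.
            if 200 <= response.status < 300 or 400 <= response.status < 500:
                async for chunk_bytes in response.content.iter_chunked(1024):
                    yield chunk_bytes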

pyproject.toml

Lines changed: 0 additions & 80 deletions

@@ -56,86 +56,6 @@ include = ["vllm*"]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# TEMPORARY! These ignores will be fixed forward
-## Line length violations
-"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
-"tests/compile/piecewise/test_simple.py" = ["E501"]
-"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
-"tests/entrypoints/conftest.py" = ["E501"]
-"tests/entrypoints/openai/test_audio.py" = ["E501"]
-"tests/entrypoints/openai/test_chat.py" = ["E501"]
-"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
-"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
-"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
-"tests/entrypoints/openai/test_video.py" = ["E501"]
-"tests/entrypoints/openai/test_vision.py" = ["E501"]
-"tests/entrypoints/test_chat_utils.py" = ["E501"]
-"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
-"tests/models/language/generation/test_gemma.py" = ["E501"]
-"tests/models/language/generation/test_mistral.py" = ["E501"]
-"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
-"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
-"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
-"tests/tool_use/test_tool_choice_required.py" = ["E501"]
-"tests/v1/attention/utils.py" = ["E501"]
-"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
-"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
-"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
-"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
-"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
-"vllm/compilation/collective_fusion.py" = ["E501"]
-"vllm/compilation/wrapper.py" = ["E501"]
-"vllm/config/vllm.py" = ["E501"]
-"vllm/distributed/device_communicators/all2all.py" = ["E501"]
-"vllm/entrypoints/openai/protocol.py" = ["E501"]
-"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
-"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
-"vllm/model_executor/models/bailing_moe.py" = ["E501"]
-"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
-"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
-"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
-"vllm/model_executor/models/phi4mm.py" = ["E501"]
-"vllm/model_executor/models/qwen3_next.py" = ["E501"]
-"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
-"vllm/v1/attention/backends/mla/common.py" = ["E501"]
-"vllm/v1/engine/utils.py" = ["E501"]
-"vllm/v1/utils.py" = ["E501"]
-"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
-## Simplification rules
-"tests/distributed/test_expert_placement.py" = ["SIM108"]
-"tests/kernels/attention/test_cutlass_mla_decode.py" = ["SIM108"]
-"tests/kernels/attention/test_flashmla.py" = ["SIM108"]
-"tests/kernels/attention/test_lightning_attn.py" = ["SIM108"]
-"tests/kernels/moe/test_pplx_moe.py" = ["SIM108"]
-"tests/kernels/quantization/test_cutlass_scaled_mm.py" = ["SIM108"]
-"tests/kernels/test_onednn.py" = ["SIM108"]
-"tests/kernels/utils.py" = ["SIM108"]
-"tests/multimodal/test_processing.py" = ["SIM108"]
-"vllm/attention/ops/triton_reshape_and_cache_flash.py" = ["SIM108"]
-"vllm/distributed/parallel_state.py" = ["SIM108"]
-"vllm/entrypoints/chat_utils.py" = ["SIM108"]
-"vllm/entrypoints/llm.py" = ["SIM108"]
-"vllm/executor/ray_distributed_executor.py" = ["SIM108", "SIM112"]
-"vllm/model_executor/layers/batch_invariant.py" = ["SIM108"]
-"vllm/model_executor/layers/fla/ops/chunk_o.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/fused_moe.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/layer.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/modular_kernel.py" = ["SIM108"]
-"vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py" = ["SIM108"]
-"vllm/model_executor/layers/layernorm.py" = ["SIM108"]
-"vllm/model_executor/layers/lightning_attn.py" = ["SIM108"]
-"vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py" = ["SIM103"]
-"vllm/model_executor/layers/quantization/compressed_tensors/utils.py" = ["SIM110"]
-"vllm/model_executor/layers/quantization/quark/utils.py" = ["SIM110"]
-"vllm/utils/__init__.py" = ["SIM108"]
-"vllm/v1/sample/ops/bad_words.py" = ["SIM108"]
-"vllm/v1/sample/rejection_sampler.py" = ["SIM108"]
-"vllm/v1/worker/tpu_model_runner.py" = ["SIM108"]
-"vllm/_custom_ops.py" = ["SIM108"]
-"tools/profiler/print_layerwise_table.py" = ["SIM118"]
-## Loop variable binding issues
-"tests/kernels/mamba/test_mamba_ssm_ssd.py" = ["B023"]
-# End of temporary ignores

 [tool.ruff.lint]
 select = [
