Commit 7e9c0bb

Merge branch 'vllm-project:main' into main

2 parents 05ddcb9 + 766bc81
138 files changed (+6372 / -1358 lines)

.buildkite/nightly-benchmarks/README.md

Lines changed: 4 additions & 4 deletions
@@ -168,9 +168,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
 ### Workflow
 
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
-- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
-- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
+- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
+- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 
 ### Nightly tests
 
@@ -180,6 +180,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
 
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
 
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

benchmarks/kernels/benchmark_bitblas.py

Lines changed: 3 additions & 1 deletion
@@ -3,14 +3,16 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
+from packaging import version
+
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
     MINIMUM_BITBLAS_VERSION,
 )
 
 try:
     import bitblas
 
-    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
+    if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
         raise ImportError(
             "bitblas version is wrong. Please "
             f"install bitblas>={MINIMUM_BITBLAS_VERSION}"

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG b99f8c821771fd11feb66d5c89661e9858fde359
+        GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

csrc/mamba/mamba_ssm/selective_scan.h

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,9 @@ struct SSMParamsBase {
     index_t out_d_stride;
     index_t out_z_batch_stride;
     index_t out_z_d_stride;
+    index_t ssm_states_batch_stride;
+    index_t ssm_states_dim_stride;
+    index_t ssm_states_dstate_stride;
 
     // Common data pointers.
     void *__restrict__ A_ptr;

csrc/mamba/mamba_ssm/selective_scan_fwd.cu

Lines changed: 14 additions & 4 deletions
@@ -132,8 +132,10 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate;
-
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) +
+        cache_index * params.ssm_states_batch_stride +
+        dim_id * kNRows * params.ssm_states_dim_stride;
+
     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
         #pragma unroll
@@ -248,7 +250,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 }
                 // Initialize running total
 
-                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0);
+                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0);
 
                 SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                 typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
@@ -259,7 +261,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 if (threadIdx.x == 0) {
                     smem_running_prefix[state_idx] = prefix_op.running_prefix;
                     if (chunk == n_chunks - 1) {
-                        ssm_states[state_idx] = input_t(prefix_op.running_prefix.y);
+                        ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
                     }
                 }
                 #pragma unroll
@@ -481,6 +483,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
        params.out_batch_stride = out.stride(1);
        params.out_d_stride = out.stride(0);
 
+       params.ssm_states_batch_stride = ssm_states.stride(0);
+       params.ssm_states_dim_stride = ssm_states.stride(1);
+       params.ssm_states_dstate_stride = ssm_states.stride(2);
+
     }
     else{
         if (!is_variable_B) {
@@ -509,6 +515,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         }
         params.out_batch_stride = out.stride(0);
         params.out_d_stride = out.stride(1);
+
+        params.ssm_states_batch_stride = ssm_states.stride(0);
+        params.ssm_states_dim_stride = ssm_states.stride(1);
+        params.ssm_states_dstate_stride = ssm_states.stride(2);
     }
 }
 
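For context, the removed line computed a flat offset into `ssm_states` assuming a densely packed `(batch, dim, dstate)` layout; the new `ssm_states_*_stride` fields, filled from `ssm_states.stride(...)` in `set_ssm_params_fwd`, let the kernel index correctly even when the cache tensor is a non-contiguous view. A small PyTorch sketch of the same idea, with hypothetical sizes chosen only for illustration:

```python
import torch

# Hypothetical sizes, purely for illustration.
batch, dim, dstate = 2, 3, 4

# Build a non-contiguous (batch, dim, dstate) view by slicing a wider buffer,
# mimicking a cache tensor whose strides do not match a packed layout.
buffer = torch.arange(batch * dim * 2 * dstate).reshape(batch, dim, 2 * dstate)
ssm_states = buffer[..., :dstate]
assert not ssm_states.is_contiguous()

flat = buffer.reshape(-1)  # the underlying storage; element value == storage index
b, d, s = 1, 2, 3

# Offset assuming a packed (batch, dim, dstate) layout: wrong for this view.
packed_offset = (b * dim + d) * dstate + s

# Stride-based offset, analogous to what the kernel now computes: correct.
sb, sd, ss = ssm_states.stride()
stride_offset = b * sb + d * sd + s * ss

print(flat[packed_offset].item() == ssm_states[b, d, s].item())   # False
print(flat[stride_offset].item() == ssm_states[b, d, s].item())   # True
```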

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -392,7 +392,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
 # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
-ARG FLASHINFER_GIT_REF="v0.2.9"
+ARG FLASHINFER_GIT_REF="v0.2.10"
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
     . /etc/environment
     git clone --depth 1 --recursive --shallow-submodules \
Two additional changed files (90.6 KB and 87.9 KB) are not rendered in this view.

docs/configuration/conserving_memory.md

Lines changed: 15 additions & 17 deletions
@@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
 
 If you run out of CPU RAM, try the following options:
 
-- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
+- (Multi-modal models only) you can set the size of multi-modal processor cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB per API process + 4 GiB per engine core process)
 - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
 
 ## Multi-modal input limits
@@ -129,20 +129,18 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.
 
 Here are some examples:
 
-??? code
-
-    ```python
-    from vllm import LLM
+```python
+from vllm import LLM
 
-    # Available for Qwen2-VL series models
-    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-              mm_processor_kwargs={
-                  "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
-              })
-
-    # Available for InternVL series models
-    llm = LLM(model="OpenGVLab/InternVL2-2B",
-              mm_processor_kwargs={
-                  "max_dynamic_patch": 4,  # Default is 12
-              })
-    ```
+# Available for Qwen2-VL series models
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          mm_processor_kwargs={
+              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
+          })
+
+# Available for InternVL series models
+llm = LLM(model="OpenGVLab/InternVL2-2B",
+          mm_processor_kwargs={
+              "max_dynamic_patch": 4,  # Default is 12
+          })
+```

docs/configuration/optimization.md

Lines changed: 29 additions & 44 deletions
@@ -2,6 +2,9 @@
 
 This guide covers optimization strategies and performance tuning for vLLM V1.
 
+!!! tip
+    Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.
+
 ## Preemption
 
 Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
@@ -126,62 +129,44 @@ Data parallelism replicates the entire model across multiple GPU sets and proces
 Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`.
 Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.
 
-## Reducing Memory Usage
-
-If you encounter out-of-memory issues, consider these strategies:
+## Input Processing
 
-### Context Length and Batch Size
+### Parallel Processing
 
-You can reduce memory usage by limiting the context length and batch size:
+You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).
+This is useful when input processing (which is run inside the API server)
+becomes a bottleneck compared to model execution (which is run inside engine core)
+and you have excess CPU capacity.
 
-```python
-from vllm import LLM
+```console
+# Run 4 API processes and 1 engine core process
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4
 
-llm = LLM(
-    model="meta-llama/Llama-3.1-8B-Instruct",
-    max_model_len=2048,  # Limit context window
-    max_num_seqs=4  # Limit batch size
-)
+# Run 4 API processes and 2 engine core processes
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2
 ```
 
-### Adjust CUDA Graph Compilation
+!!! note
+    API server scale-out is only available for online inference.
 
-CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level:
-
-```python
-from vllm import LLM
-from vllm.config import CompilationConfig, CompilationLevel
-
-llm = LLM(
-    model="meta-llama/Llama-3.1-8B-Instruct",
-    compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        cudagraph_capture_sizes=[1, 2, 4, 8]  # Capture fewer batch sizes
-    )
-)
-```
+!!! note
+    [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled
+    because it requires a one-to-one correspondance between API and engine core processes.
 
-Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`:
+## Multi-Modal Caching
 
-```python
-from vllm import LLM
+### Processor Cache
 
-llm = LLM(
-    model="meta-llama/Llama-3.1-8B-Instruct",
-    enforce_eager=True  # Disable CUDA graph compilation
-)
-```
+By default, the multi-modal processor cache is enabled to avoid repeatedly processing
+the same multi-modal inputs via Hugging Face `AutoProcessor`,
+which commonly occurs in multi-turn conversations.
 
-### Multimodal Models
+You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable
+(default 4 GiB per API process + 4 GiB per engine core process).
 
-For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request:
+If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`:
 
 ```python
-from vllm import LLM
-
-# Accept up to 2 images per prompt
-llm = LLM(
-    model="Qwen/Qwen2.5-VL-3B-Instruct",
-    limit_mm_per_prompt={"image": 2}
-)
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          disable_mm_preprocessor_cache=True)
 ```
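As a quick recap of the two knobs documented above, here is a hedged sketch that combines them; the 2 GiB value is arbitrary, and setting the environment variable from Python assumes it is read when the engine starts rather than at import time:

```python
import os

# Illustrative only: shrink the multi-modal processor cache to 2 GiB per process.
os.environ["VLLM_MM_INPUT_CACHE_GIB"] = "2"

from vllm import LLM

# Or opt out of the cache entirely, e.g. when prompts rarely repeat the same images.
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          disable_mm_preprocessor_cache=True)
```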
