
Commit 49cdc3d

Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/dbo-full-cudagraphs
2 parents: ba00047 + 3724107

141 files changed: +5050 additions, −1262 deletions


.buildkite/check-wheel-size.py

Lines changed: 4 additions & 4 deletions

@@ -5,11 +5,11 @@
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


 def print_top_10_largest_files(zip_file):
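The size check reads its limit from the `VLLM_MAX_SIZE_MB` environment variable, falling back to the new default of 450. A minimal sketch of overriding it for a local run; the positional wheel-directory argument is an assumption about how the script is invoked, not something shown in this hunk:

```bash
# Hypothetical local invocation: raise the limit to 500 MB for one run.
# The "dist" positional argument is assumed; check the script's actual CLI.
VLLM_MAX_SIZE_MB=500 python3 .buildkite/check-wheel-size.py dist
```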

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 5 additions & 4 deletions

@@ -30,10 +30,11 @@ docker run \
 bash -c '
 set -e
 echo $ZE_AFFINITY_MASK
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 cd tests
 pytest -v -s v1/core
 pytest -v -s v1/engine
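The dropped `VLLM_USE_V1=1` prefixes reflect the V1 engine being the default, and the new last command pins the attention backend through `VLLM_ATTENTION_BACKEND`. The same switch works outside the CI container; a minimal sketch assuming the standard `vllm serve` entrypoint, with an illustrative model choice:

```bash
# Select the Triton attention backend for an ad-hoc server run
# (entrypoint and model are illustrative, not taken from this script).
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 vllm serve facebook/opt-125m --enforce-eager
```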

README.md

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ Easy, fast, and cheap LLM serving for everyone

 *Latest News* 🔥

+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
 - [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).

benchmarks/auto_tune/README.md

Lines changed: 6 additions & 0 deletions

@@ -31,6 +31,12 @@ cd vllm

 You must set the following variables at the top of the script before execution.

+Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |

benchmarks/auto_tune/auto_tune.sh

Lines changed: 31 additions & 15 deletions

@@ -5,25 +5,41 @@

 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"

-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNEPARAMETERS ===================="

 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH

@@ -213,7 +229,7 @@ run_benchmark() {

 pkill -if vllm
 sleep 10
-printf '=%.0s' $(seq 1 20)
+echo "===================="
 return 0
 }
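The rewrite switches every tunable to Bash's `${VAR:-default}` expansion, so a value exported by the caller wins and the script's default applies otherwise. A minimal standalone sketch of the same pattern, reusing two of the script's variable names (this demo file is not part of auto_tune.sh):

```bash
#!/usr/bin/env bash
# demo.sh: defaults apply only when the caller has not set the variable.
INPUT_LEN=${INPUT_LEN:-4000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
echo "INPUT_LEN=$INPUT_LEN NUM_SEQS_LIST=$NUM_SEQS_LIST"

# bash demo.sh                 -> INPUT_LEN=4000 NUM_SEQS_LIST=128 256
# INPUT_LEN=128 bash demo.sh   -> INPUT_LEN=128  NUM_SEQS_LIST=128 256
```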

benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh

Lines changed: 2 additions & 2 deletions

@@ -62,7 +62,7 @@ benchmark() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


 CUDA_VISIBLE_DEVICES=1 python3 \

@@ -72,7 +72,7 @@ benchmark() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 wait_for_server 8100
 wait_for_server 8200

benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh

Lines changed: 2 additions & 2 deletions

@@ -69,7 +69,7 @@ launch_disagg_prefill() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 CUDA_VISIBLE_DEVICES=1 python3 \
 -m vllm.entrypoints.openai.api_server \

@@ -78,7 +78,7 @@ launch_disagg_prefill() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 wait_for_server 8100
 wait_for_server 8200
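Both benchmark scripts switch the KV-transfer connector from PyNcclConnector to P2pNcclConnector while keeping the producer/consumer split across two GPUs. A minimal sketch of that launch pattern; the model placeholder and ports mirror the hunks above, but the surrounding flags are illustrative assumptions rather than the full scripts:

```bash
MODEL="meta-llama/Llama-3.1-8B-Instruct"  # placeholder; the scripts set their own model

# Prefill ("producer") instance on GPU 0, port 8100.
CUDA_VISIBLE_DEVICES=0 python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

# Decode ("consumer") instance on GPU 1, port 8200.
CUDA_VISIBLE_DEVICES=1 python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

# The real scripts then poll both ports with their wait_for_server helper.
```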

cmake/cpu_extension.cmake

Lines changed: 2 additions & 1 deletion

@@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)

 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 message(STATUS "Apple Silicon Detected")
+set(APPLE_SILICON_FOUND TRUE)
 set(ENABLE_NUMA OFF)
 check_sysctl(hw.optional.neon ASIMD_FOUND)
 check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)

@@ -189,7 +190,7 @@ else()
 set(USE_ACL OFF)
 endif()

-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
 FetchContent_Declare(
 oneDNN
 GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
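The new APPLE_SILICON_FOUND flag keeps the oneDNN fetch out of Apple Silicon builds, so macOS arm64 CPU builds skip that dependency. For anyone reproducing that path, a hedged sketch of a CPU-only source build; the VLLM_TARGET_DEVICE=cpu switch and the requirements file path are assumptions drawn from vLLM's CPU build documentation, not from this diff:

```bash
# Hypothetical CPU-only build on an Apple Silicon Mac (paths and flags assumed).
git clone https://github.com/vllm-project/vllm.git && cd vllm
pip install -r requirements/cpu.txt
VLLM_TARGET_DEVICE=cpu pip install -e . --no-build-isolation
```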

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ else()
 FetchContent_Declare(
 vllm-flash-attn
 GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
 GIT_PROGRESS TRUE
 # Don't share the vllm-flash-attn build between build types
 BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

csrc/attention/mla/sm100_cutlass_mla_kernel.cu

Lines changed: 8 additions & 8 deletions

@@ -64,11 +64,11 @@ struct IsPersistent {
 static const bool value = v;
 };

-template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+template <typename T, typename TOut, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
 struct MlaSm100 {
 using Element = T;
 using ElementAcc = float;
-using ElementOut = T;
+using ElementOut = TOut;

 using TileShape = Shape<_128, _128, Shape<_512, _64>>;
 using TileShapeH = cute::tuple_element_t<0, TileShape>;

@@ -178,7 +178,7 @@ typename T::Fmha::Arguments args_from_options(
 return arguments;
 }

-template <typename Element, bool IsPaged128, typename PersistenceOption>
+template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
 void runMla(
 at::Tensor const& out,
 at::Tensor const& q_nope,

@@ -190,7 +190,7 @@ void runMla(
 double sm_scale,
 int64_t num_kv_splits,
 cudaStream_t stream) {
-using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
+using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
 typename MlaSm100Type::Fmha fmha;
 auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);

@@ -233,13 +233,13 @@ void sm100_cutlass_mla_decode(
 DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
 DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
 if (in_dtype == at::ScalarType::Half) {
-runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
 out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
 } else if (in_dtype == at::ScalarType::BFloat16) {
-runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
 out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
 } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
 out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
 } else {
 TORCH_CHECK(false, "Unsupported input data type of MLA");

@@ -253,7 +253,7 @@ void sm100_cutlass_mla_decode(
 int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
 // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
 // which are float, so Element type here doesn't matter.
-using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
+using MlaSm100Type = MlaSm100<cutlass::half_t, cutlass::half_t, true>;

 // Get split kv. Requires problem shape and sm_count only.
 typename MlaSm100Type::Fmha::Arguments arguments;
