
Commit ab92741

Merge remote-tracking branch 'upstream/main'

2 parents: 3712649 + 476844d

74 files changed (+2018, −420 lines)

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 4 additions & 0 deletions

@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
+
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
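
A note on the idiom used above: `${commands//pattern/replacement}` is bash's global substring replacement, so every matching test invocation in `$commands` gets the environment override prepended, not just the first. A minimal standalone sketch (the sample value of `commands` is illustrative, not from the script):

    commands="pytest -v -s lora && pytest -v -s lora/more"
    # '//' replaces every occurrence; a single '/' would replace only the first
    commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
    echo "$commands"
    # VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora && VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora/more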

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 20 additions & 22 deletions

@@ -8,67 +8,65 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
-  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
+  docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"
 
   # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10
     export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -83,12 +81,12 @@ function cpu_tests() {
     --tokenizer facebook/opt-125m"
 
   # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/lora/test_qwen2vl.py"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
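
A note on the last two lines: `export -f cpu_tests` exports the shell function into the environment so the child shell spawned by `timeout 40m bash -c ...` can call it, with the positional arguments arriving as `$1`/`$2` inside `cpu_tests`. A minimal standalone sketch of the mechanism (`greet` is a hypothetical function, for illustration only):

    greet() { echo "hello $1"; }
    export -f greet                      # exported functions are visible to child bash processes
    timeout 10s bash -c "greet world"    # prints: hello world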

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 4 additions & 0 deletions

@@ -155,6 +155,10 @@ run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
   "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 14 deletions

@@ -98,9 +98,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental, amdproduction]
-  amd_gpus: 4 # Just for the sake of queue testing
   fast_check: true
   source_file_dependencies:
   - vllm/core
@@ -114,7 +112,6 @@
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   torch_nightly: true
-  amd_gpus: 2 # Just for the sake of queue testing
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
@@ -207,7 +204,6 @@
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
-  working_dir: "/vllm-workspace/tests" # optional
 
 - label: V1 Test
   mirror_hardwares: [amdexperimental]
@@ -261,7 +257,6 @@
   - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
@@ -292,9 +287,7 @@
   - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  working_dir: "/vllm-workspace/tests"
-  mirror_hardwares: [amdexperimental]
-  amd_gpus: 8
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -335,10 +328,8 @@
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Test %N # 1h each
-  working_dir: "/vllm-workspace/tests"
-  # mirror_hardwares: [amdexperimental, amdproduction]
-  amd_gpus: 8
+- label: Kernels Core Operation Test
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
@@ -384,7 +375,6 @@
   - pytest -v -s kernels/mamba
 
 - label: Tensorizer Test # 11min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental, amdproduction]
   soft_fail: true
   source_file_dependencies:
@@ -461,7 +451,6 @@
   - pytest -v -s encoder_decoder
 
 - label: OpenAI-Compatible Tool Use # 20 min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental]
   fast_check: false
   source_file_dependencies:

CMakeLists.txt

Lines changed: 0 additions & 5 deletions

@@ -189,11 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 
-  #
-  # Set rocm version dev int.
-  #
-  list(APPEND VLLM_GPU_FLAGS "-DROCM_VERSION=${ROCM_VERSION_DEV_INT}")
-
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.

docker/Dockerfile.cpu

Lines changed: 7 additions & 3 deletions

@@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
 
@@ -85,7 +86,7 @@ WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl
+    apt-get install -y --no-install-recommends vim numactl xz-utils
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -108,8 +109,11 @@ FROM base AS vllm-test
 WORKDIR /workspace/
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
-    uv pip install -r requirements/test.txt
+    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/test-cpu.in && \
+    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
+    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
+    uv pip install -r requirements/cpu-test.txt
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
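
The reworked `vllm-test` stage no longer installs a pre-resolved `requirements/test.txt`; it derives a CPU-specific lockfile at build time, dropping `mamba_ssm` (presumably because it does not build in a CPU-only environment) before resolving. The same flow can be reproduced outside Docker, assuming `uv` is installed:

    cp requirements/test.in requirements/test-cpu.in
    sed -i '/mamba_ssm/d' requirements/test-cpu.in                     # remove the excluded package
    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt
    uv pip install -r requirements/cpu-test.txt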

docker/Dockerfile.neuron

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 RUN python3 -m pip install -U \
-    'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+    'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
     -r requirements/neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron

docs/cli/README.md

Lines changed: 0 additions & 13 deletions

@@ -12,19 +12,6 @@ Available Commands:
 vllm {chat,complete,serve,bench,collect-env,run-batch}
 ```
 
-## Table of Contents
-
-- [serve](#serve)
-- [chat](#chat)
-- [complete](#complete)
-- [bench](#bench)
-  - [latency](#latency)
-  - [serve](#serve-1)
-  - [throughput](#throughput)
-- [collect-env](#collect-env)
-- [run-batch](#run-batch)
-- [More Help](#more-help)
-
 ## serve
 
 Start the vLLM OpenAI Compatible API server.

docs/deployment/docker.md

Lines changed: 12 additions & 1 deletion

@@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \
     -t vllm/vllm-gh200-openai:latest \
     --build-arg max_jobs=66 \
     --build-arg nvcc_threads=2 \
-    --build-arg torch_cuda_arch_list="9.0+PTX" \
+    --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
     --build-arg vllm_fa_cmake_gpu_arches="90-real"
 ```
 
+!!! note
+    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
+
+    Run the following command on your host machine to register QEMU user static handlers:
+
+    ```console
+    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+    ```
+
+    After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
+
 ## Use the custom-built vLLM Docker image
 
 To run vLLM with the custom-built Docker image:
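
Putting the new note together with the build command shown above it, a cross-built `linux/arm64` invocation from an x86_64 host would look roughly like the following (the tag and build args are illustrative, taken from the GH200 example in the same doc):

    # one-time QEMU registration on the host, then the cross-platform build
    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
    DOCKER_BUILDKIT=1 docker build . \
      --platform "linux/arm64" \
      -t vllm/vllm-gh200-openai:latest \
      --build-arg max_jobs=66 \
      --build-arg nvcc_threads=2 \
      --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
      --build-arg vllm_fa_cmake_gpu_arches="90-real"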

docs/deployment/nginx.md

Lines changed: 0 additions & 10 deletions

@@ -5,16 +5,6 @@ title: Using Nginx
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
-Table of contents:
-
-1. [Build Nginx Container][nginxloadbalancer-nginx-build]
-2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
-3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
-4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
-5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
-6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
-7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
-
 [](){ #nginxloadbalancer-nginx-build }
 
 ## Build Nginx Container
