Skip to content

Commit 93d7a4d

Browse files
authored
Merge pull request ROCm#575 from ROCm/upstream_merge_2025_06_09
Upstream merge 2025 06 09
2 parents cdfe72b + 6ec2533 commit 93d7a4d

File tree

155 files changed

+6430
-1766
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

155 files changed

+6430
-1766
lines changed

.buildkite/scripts/ci-clean-log.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# Usage: ./ci_clean_log.sh ci.log
3+
# This script strips timestamps and color codes from CI log files.
4+
5+
# Check if argument is given
6+
if [ $# -lt 1 ]; then
7+
echo "Usage: $0 ci.log"
8+
exit 1
9+
fi
10+
11+
INPUT_FILE="$1"
12+
13+
# Strip timestamps
14+
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
15+
16+
# Strip colorization
17+
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ set -ex
77
# Setup cleanup
88
remove_docker_container() {
99
if [[ -n "$container_id" ]]; then
10+
podman stop --all -t0
1011
podman rm -f "$container_id" || true
1112
fi
1213
podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
3738
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
3839
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
3940
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
40-
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
41+
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
4142
}
4243

4344
# All of CPU tests are expected to be finished less than 40 mins.

.buildkite/scripts/rerun-test.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# Repeatedly run a single pytest target until it fails, announcing each rerun.
# Useful for reproducing flaky tests.
#
# Usage: ./rerun_test.sh path/to/test.py::test_name

# Require the test-selector argument.
if [ $# -lt 1 ]; then
    echo "Usage: $0 path/to/test.py::test_name"
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

TEST=$1
COUNT=1

# Keep rerunning while pytest succeeds; the loop ends on the first failure.
while pytest -sv "$TEST"; do
    COUNT=$(( COUNT + 1 ))
    echo "RUN NUMBER ${COUNT}"
done
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
#!/bin/bash

set -euo pipefail

# Locate Docker's storage root so we can check the right filesystem.
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"

# Current usage (percent, numeric) of the filesystem holding the Docker root.
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

# Cleanup trigger threshold, in percent.
threshold=70

if [ "$disk_usage" -gt "$threshold" ]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Environment config
2+
TEST_NAME=llama8b
3+
CONTAINER_NAME=vllm-tpu
4+
5+
# vllm config
6+
MODEL=meta-llama/Llama-3.1-8B-Instruct
7+
MAX_NUM_SEQS=512
8+
MAX_NUM_BATCHED_TOKENS=512
9+
TENSOR_PARALLEL_SIZE=1
10+
MAX_MODEL_LEN=2048
11+
DOWNLOAD_DIR=/mnt/disks/persist
12+
EXPECTED_THROUGHPUT=8.0
13+
INPUT_LEN=1800
14+
OUTPUT_LEN=128
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
#!/bin/bash
# Build the vllm TPU image, run the benchmark inside a container configured by
# the given env file, copy the logs back, and assert the measured throughput
# meets EXPECTED_THROUGHPUT.
#
# Usage: <script> path/to/env_file

if [ ! -f "$1" ]; then
  echo "Error: The env file '$1' does not exist."
  exit 1 # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# Quoted to survive paths containing spaces.
source "$ENV_FILE"

# Remove any containers left over from a previous run (best-effort).
remove_docker_container() {
  docker rm -f tpu-test || true;
  docker rm -f vllm-tpu || true;
  docker rm -f "$CONTAINER_NAME" || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
 --build-arg max_jobs=16 \
 --build-arg USE_SCCACHE=1 \
 --build-arg GIT_REPO_CHECK=0 \
 --tag vllm/vllm-tpu-bm \
 --progress plain -f docker/Dockerfile.tpu .

LOG_ROOT=$(mktemp -d)
# `set -e` is not enabled in this script, so check mktemp's result explicitly.
if [ -z "$LOG_ROOT" ]; then
  echo "Error: failed to create a temporary log directory."
  exit 1
fi
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
  echo "Error: HF_TOKEN is not set or is empty."
  exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If no mounted drive, just create a folder."
  exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
# Expansions are quoted so paths/names with spaces don't word-split.
docker run \
 -v "$DOWNLOAD_DIR:$DOWNLOAD_DIR" \
 --env-file "$ENV_FILE" \
 -e HF_TOKEN="$HF_TOKEN" \
 -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
 -e MODEL="$MODEL" \
 -e WORKSPACE=/workspace \
 --name "$CONTAINER_NAME" \
 -d \
 --privileged \
 --network host \
 -v /dev/shm:/dev/shm \
 vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

# Extract the numeric throughput from the benchmark log line
# "Request throughput (req/s): <value>".
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "$VLLM_LOG"
  buildkite-agent artifact upload "$BM_LOG"
else
  echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput"
  exit 1
fi

if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
  exit 1
fi

.buildkite/scripts/tpu/run_bm.sh

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/bin/bash
# Run the serving benchmark inside the container: start `vllm serve` in the
# background, wait until it reports startup (or fail on error/timeout), then
# run benchmark_serving.py and report the measured throughput.

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

# When a target commit is provided, refuse to benchmark a different checkout.
if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
mkdir "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x (the sonnet dataset concatenated four times)
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "launching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve "$MODEL" \
 --seed 42 \
 --disable-log-requests \
 --max-num-seqs "$MAX_NUM_SEQS" \
 --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
 --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
 --no-enable-prefix-caching \
 --download_dir "$DOWNLOAD_DIR" \
 --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &

# Poll the server log: up to 120 x 10s (20 minutes) for startup, failing fast
# on a logged RuntimeError.
started=0
for _ in {1..120}; do
  # TODO: detect other type of errors.
  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
    echo "Detected RuntimeError, exiting."
    exit 1
  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
    echo "Application started"
    started=1
    break
  else
    echo "wait for 10 seconds..."
    sleep 10
  fi
done

# Previously the script fell through and ran the benchmark even when the
# server never came up; fail explicitly instead.
if [ "$started" -ne 1 ]; then
  echo "Error: vllm did not start within the timeout; see $VLLM_LOG"
  exit 1
fi

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
 --backend vllm \
 --model "$MODEL" \
 --dataset-name sonnet \
 --dataset-path benchmarks/sonnet_4x.txt \
 --sonnet-input-len "$INPUT_LEN" \
 --sonnet-output-len "$OUTPUT_LEN" \
 --ignore-eos > "$BM_LOG"

echo "completed..."
echo

# Extract the numeric throughput from the benchmark log line
# "Request throughput (req/s): <value>".
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,9 @@ steps:
424424
- vllm/model_executor/layers/quantization
425425
- tests/quantization
426426
commands:
427+
# temporary install here since we need nightly, will move to requirements/test.in
428+
# after torchao 0.12 release
429+
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
427430
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
428431

429432
- label: LM Eval Small Models # 53min

.github/mergify.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,20 @@ pull_request_rules:
3636
add:
3737
- frontend
3838

39+
- name: label-llama
40+
description: Automatically apply llama label
41+
conditions:
42+
- or:
43+
- files~=^examples/.*llama.*\.py
44+
- files~=^tests/.*llama.*\.py
45+
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
46+
- files~=^vllm/model_executor/models/.*llama.*\.py
47+
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
48+
actions:
49+
label:
50+
add:
51+
- llama
52+
3953
- name: label-multi-modality
4054
description: Automatically apply multi-modality label
4155
conditions:

CMakeLists.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
308308
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
309309
# are not supported by Machete yet.
310310
# 9.0 for latest bf16 atomicAdd PTX
311-
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
311+
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
312312
if (MARLIN_ARCHS)
313313

314314
#
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
454454
# kernels for the remaining archs that are not already built for 3x.
455455
# (Build 8.9 for FP8)
456456
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
457-
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
457+
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
458458
# subtract out the archs that are already built for 3x
459459
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
460460
if (SCALED_MM_2X_ARCHS)
@@ -543,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
543543
# CUTLASS MoE kernels
544544

545545
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
546-
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
547-
# to compile MoE kernels that use its output.
546+
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
547+
# if it's possible to compile MoE kernels that use its output.
548548
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
549549
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
550550
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
684684

685685
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
686686
# 9.0 for latest bf16 atomicAdd PTX
687-
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
687+
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
688688
if (MARLIN_MOE_ARCHS)
689689

690690
#

0 commit comments

Comments
 (0)