#!/bin/bash
#
# Builds the vLLM TPU benchmark image, runs the benchmark script inside a
# container, copies the logs back and fails when the measured throughput is
# below EXPECTED_THROUGHPUT.
#
# Usage: <this script> ENV_FILE
#
# ENV_FILE is sourced by this script and also handed to `docker run
# --env-file`. The script reads HF_TOKEN, DOWNLOAD_DIR, MODEL, CONTAINER_NAME,
# TEST_NAME and EXPECTED_THROUGHPUT (plus BUILDKITE / BUILDKITE_COMMIT) without
# setting them itself, so they must come from the sourced files or from the
# calling environment.

if [[ ! -f "$1" ]]; then
  echo "Error: The env file '$1' does not exist." >&2
  exit 1 # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# Quoted so that an env file path containing spaces is sourced as one file.
source "$ENV_FILE"

#######################################
# Force-remove the containers this benchmark may have left behind.
# Globals:   CONTAINER_NAME (read) - extra container to remove; skipped when
#            unset or empty.
# Arguments: none
# Returns:   0 always; a failing `docker rm` is deliberately ignored so the
#            function is safe both as an EXIT trap and as a pre-run cleanup.
#######################################
remove_docker_container() {
  local name
  for name in tpu-test vllm-tpu "${CONTAINER_NAME:-}"; do
    # An empty name would turn into a bare `docker rm -f`, which can only
    # produce a usage error, so skip it.
    if [[ -z "$name" ]]; then
      continue
    fi
    docker rm -f "$name" || true
  done
  return 0
}

# Run the container cleanup on every exit path of the script.
trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Validate the configuration before the image build so that a bad setup is
# reported without first paying for the build.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "Error: HF_TOKEN is not set or is empty." >&2
  exit 1
fi

# Make sure mounted disk or dir exists
if [[ ! -d "${DOWNLOAD_DIR:-}" ]]; then
  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder." >&2
  exit 1
fi

# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
# The build status is checked explicitly: this script does not enable
# `set -e`, and continuing after a failed build could run the benchmark
# against a previously built image carrying the same tag.
if ! DOCKER_BUILDKIT=1 docker build \
  --build-arg max_jobs=16 \
  --build-arg USE_SCCACHE=1 \
  --build-arg GIT_REPO_CHECK=0 \
  --tag vllm/vllm-tpu-bm \
  --progress plain -f docker/Dockerfile.tpu .; then
  echo "Error: docker build of vllm/vllm-tpu-bm failed." >&2
  exit 1
fi

# Directory that receives the logs copied out of the container. `set -e` is
# not active, so a mktemp failure has to be handled here.
if ! LOG_ROOT=$(mktemp -d); then
  echo "Error: could not create a temporary directory for the results." >&2
  exit 1
fi
echo "Results will be stored in: $LOG_ROOT"

echo "Run model $MODEL"
echo

# CONTAINER_NAME feeds `docker run --name`; when it is empty the option would
# swallow the next flag instead, so stop here with a clear message.
if [[ -z "${CONTAINER_NAME:-}" ]]; then
  echo "Error: CONTAINER_NAME is not set or is empty." >&2
  exit 1
fi

echo "starting docker...$CONTAINER_NAME"
echo
# Start a detached container whose main process (`tail -f /dev/null`) never
# finishes, so the benchmark can be started in it with `docker exec` below.
# Every expansion is quoted so values containing spaces stay single arguments.
if ! docker run \
  -v "$DOWNLOAD_DIR:$DOWNLOAD_DIR" \
  --env-file "$ENV_FILE" \
  -e HF_TOKEN="$HF_TOKEN" \
  -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
  -e MODEL="$MODEL" \
  -e WORKSPACE=/workspace \
  --name "$CONTAINER_NAME" \
  -d \
  --privileged \
  --network host \
  -v /dev/shm:/dev/shm \
  vllm/vllm-tpu-bm tail -f /dev/null; then
  echo "Error: failed to start container '$CONTAINER_NAME'." >&2
  exit 1
fi

echo "run script..."
echo
# NOTE: the exit status of the benchmark script is not checked here; the
# steps that follow run regardless of it.
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"

# Copy both log files out of the container into LOG_ROOT, prefixing each
# file name with the test name.
echo "copy result back..."
VLLM_LOG="${LOG_ROOT}/${TEST_NAME}_vllm_log.txt"
BM_LOG="${LOG_ROOT}/${TEST_NAME}_bm_log.txt"
docker cp "${CONTAINER_NAME}:/workspace/vllm_log.txt" "${VLLM_LOG}"
docker cp "${CONTAINER_NAME}:/workspace/bm_log.txt" "${BM_LOG}"

# Select the "Request throughput" line(s) of the benchmark log and strip
# every character that is not a digit or a dot.
throughput=$(
  grep "Request throughput (req/s):" "${BM_LOG}" \
    | sed 's/[^0-9.]//g'
)
echo "throughput for ${TEST_NAME} at ${BUILDKITE_COMMIT}: ${throughput}"

# Upload the logs as build artifacts only when BUILDKITE is exactly "true".
if [[ "${BUILDKITE}" != "true" ]]; then
  echo "Not running inside Buildkite"
else
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "${VLLM_LOG}"
  buildkite-agent artifact upload "${BM_LOG}"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#

# The measured value must be a plain decimal number (e.g. 12 or 12.34).
if [[ ! "${throughput:-}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput" >&2
  exit 1
fi

# The threshold must be a number as well (12, 12.5, 12. and .5 are accepted).
# Without this check an unset or malformed EXPECTED_THROUGHPUT would make the
# comparison below produce nothing, and the script would report success
# without having compared anything.
expected_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
if [[ ! "${EXPECTED_THROUGHPUT:-}" =~ $expected_re ]]; then
  echo "Error: EXPECTED_THROUGHPUT('${EXPECTED_THROUGHPUT:-}') is not set or is not a number" >&2
  exit 1
fi

# awk performs the floating-point comparison and exits 0 exactly when the
# measured throughput is below the expectation.
if awk -v actual="$throughput" -v expected="$EXPECTED_THROUGHPUT" \
  'BEGIN { exit !(actual + 0 < expected + 0) }'; then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" >&2
  exit 1
fi