
Commit 9ba8734

Merge branch 'upstream-main' into tms/add_mamba
2 parents b2a8cd8 + 98c12cf commit 9ba8734

File tree

325 files changed (+17020 / -3472 lines)


.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.409
+    value: 0.419
   - name: "exact_match,flexible-extract"
-    value: 0.406
+    value: 0.416
 limit: 1000
 num_fewshot: 5
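
This config is one of the lm-eval-harness accuracy checks run in CI; the commit raises the expected GSM8K scores for the QQQ-quantized Llama-3-8B model from 0.409/0.406 to 0.419/0.416. As a rough illustration only (a sketch, not the exact CI runner invocation; the checkpoint path is a placeholder), a config like this corresponds to an lm_eval run along these lines:

    lm_eval --model vllm \
        --model_args pretrained=<Meta-Llama-3-8B-QQQ-checkpoint>,tensor_parallel_size=1 \
        --tasks gsm8k --num_fewshot 5 --limit 1000

The reported exact_match,strict-match and exact_match,flexible-extract metrics are then compared against the value fields above.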

.buildkite/nightly-benchmarks/README.md

Lines changed: 5 additions & 4 deletions
@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
 
 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
 Nightly benchmark will be triggered when:
-- Every commit for those PRs with `nightly-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 
 
 
 ## Performance benchmark details
 
-See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 
 #### Latency test

@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:
 
 In this example:
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
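
To make the `parameters` convention concrete, a hypothetical `latency-tests.json` entry matching the arguments quoted above might look like this (a sketch; the test name and exact fields in the repo's file may differ):

    [
        {
            "test_name": "latency_llama8B_tp1",
            "parameters": {
                "model": "meta-llama/Meta-Llama-3-8B",
                "tensor_parallel_size": 1,
                "load_format": "dummy",
                "num_iters_warmup": 5,
                "num_iters": 15
            }
        }
    ]

run-performance-benchmarks.sh then converts each underscored key into a dashed flag, yielding `benchmark_latency.py --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.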

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ steps:
       containers:
       - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
         command:
-        - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+        - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
         resources:
           limits:
             nvidia.com/gpu: 8
File renamed without changes.

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 2 additions & 2 deletions
@@ -174,8 +174,8 @@ def results_to_json(latency, throughput, serving):
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
 
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
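
The pattern here (not fully visible in the hunk) is a markdown template whose named placeholders are filled with generated tables via str.format. A minimal sketch of that idea, assuming read_markdown simply returns the file contents and that the template contains the placeholder names shown in the keyword arguments:

    from pathlib import Path

    def read_markdown(path: str) -> str:
        # assumption: the helper just loads the markdown file as a string
        return Path(path).read_text()

    template = read_markdown("../.buildkite/nightly-benchmarks/" +
                             "performance-benchmarks-descriptions.md")
    # the template is assumed to contain {latency_tests_markdown_table}
    # and {throughput_tests_markdown_table} placeholders
    report = template.format(
        latency_tests_markdown_table="| Test | Mean latency (ms) |\n|---|---|",
        throughput_tests_markdown_table="| Test | Tput (req/s) |\n|---|---|",
    )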

.buildkite/nightly-benchmarks/run-benchmarks-suite.sh renamed to .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 28 additions & 17 deletions
@@ -37,9 +37,9 @@ check_hf_token() {
 ensure_sharegpt_downloaded() {
   local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
   if [ ! -f "$FILE" ]; then
-    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
   else
-    echo "$FILE already exists."
+    echo "$FILE already exists."
   fi
 }
 
@@ -68,11 +68,29 @@ wait_for_server() {
   done' && return 0 || return 1
 }
 
+kill_processes_launched_by_current_bash() {
+  # Kill all python processes launched from current bash script
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -n "$processes" ]; then
+    echo "Killing the following processes matching '$1':"
+    echo "$processes"
+    echo "$processes" | xargs kill -9
+  else
+    echo "No processes found matching '$1'."
+  fi
+}
+
 kill_gpu_processes() {
-  # kill all processes on GPU.
 
-  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-  ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pkill -f pt_main_thread
+  # this line doesn't work now
+  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  pkill -f python3
+  pkill -f /usr/bin/python3
+
 
   # wait until GPU memory usage smaller than 1GB
   while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do

@@ -82,11 +100,6 @@ kill_gpu_processes() {
   # remove vllm config file
   rm -rf ~/.config/vllm
 
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }
 
 upload_to_buildkite() {

@@ -104,7 +117,7 @@ upload_to_buildkite() {
   fi
 
   # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
   $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

@@ -156,7 +169,7 @@ run_latency_tests() {
       latency_command: $latency,
       gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$latency_command"

@@ -166,7 +179,6 @@ run_latency_tests() {
   done
 }
 
-
 run_throughput_tests() {
   # run throughput tests using `benchmark_throughput.py`
   # $1: a json file specifying throughput test cases

@@ -214,7 +226,7 @@ run_throughput_tests() {
       throughput_command: $command,
       gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$throughput_command"

@@ -246,7 +258,6 @@ run_serving_tests() {
       continue
     fi
 
-
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.server_parameters')
     client_params=$(echo "$params" | jq -r '.client_parameters')

@@ -324,7 +335,7 @@ run_serving_tests() {
       client_command: $client,
       gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
   done

@@ -341,6 +352,7 @@ main() {
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
 
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

@@ -359,7 +371,6 @@ main() {
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
-
   # postprocess benchmarking results
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

.buildkite/run-amd-test.sh

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ docker run \
         --network host \
         --shm-size=16gb \
         --rm \
+        -e HIP_VISIBLE_DEVICES=0 \
         -e HF_TOKEN \
         -v ${HF_CACHE}:${HF_MOUNT} \
         -e HF_HOME=${HF_MOUNT} \

.buildkite/run-tpu-test.sh

Lines changed: 1 addition & 2 deletions
@@ -12,5 +12,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
-    python3 /workspace/vllm/examples/offline_inference_tpu.py
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"

.buildkite/test-pipeline.yaml

Lines changed: 45 additions & 14 deletions
@@ -86,15 +86,18 @@ steps:
   - vllm/
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s entrypoints/llm
+  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   fast_check: true
   source_file_dependencies:
-  - vllm/
+  - vllm/distributed/
+  - vllm/core/
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
   commands:

@@ -111,10 +114,10 @@ steps:
   commands:
   - pytest -v -s metrics
   - "pip install \
-    opentelemetry-sdk \
-    opentelemetry-api \
-    opentelemetry-exporter-otlp \
-    opentelemetry-semantic-conventions-ai"
+    'opentelemetry-sdk>=1.26.0,<1.27.0' \
+    'opentelemetry-api>=1.26.0,<1.27.0' \
+    'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
+    'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
   - pytest -v -s tracing
 
 ##### fast check tests #####

@@ -230,12 +233,13 @@ steps:
   parallelism: 4
 
 - label: Tensorizer Test # 11min
+  mirror_hardwares: [amd]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
   commands:
-  - apt-get install -y curl libsodium23
+  - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s tensorizer_loader

@@ -283,11 +287,15 @@ steps:
   num_gpus: 2
   num_nodes: 2
   source_file_dependencies:
-  - vllm/
-  - tests/distributed/test_same_node
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

@@ -297,8 +305,11 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
-  - vllm/
-  - tests/distributed
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
   commands:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py

@@ -311,13 +322,33 @@ steps:
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
+- label: Multi-step Tests (4 GPUs) # 21min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/model_executor/layers/sampler.py
+  - vllm/sequence.py
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/multi_step_worker.py
+  - vllm/worker/model_runner_base.py
+  - vllm/worker/model_runner.py
+  - vllm/worker/multi_step_model_runner.py
+  - vllm/engine
+  - tests/multi_step
+  commands:
+  - pytest -v -s multi_step/test_correctness_async_llm.py
+  - pytest -v -s multi_step/test_correctness_llm.py
+
 - label: Pipeline Parallelism Test # 23min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
-  - vllm/
-  - tests/distributed/test_pp_cudagraph.py
-  - tests/distributed/test_pipeline_parallel
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
   commands:
   - pytest -v -s distributed/test_pp_cudagraph.py
   - pytest -v -s distributed/test_pipeline_parallel.py

.github/ISSUE_TEMPLATE/100-documentation.yml

Lines changed: 7 additions & 0 deletions
@@ -20,3 +20,10 @@ body:
   attributes:
     value: >
       Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
