EmbeddedLLM
diff --git a/‎.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh‎
Lines changed: 1 addition & 1 deletion b/‎.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh‎
Lines changed: 107 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh‎
Lines changed: 107 additions & 0 deletions
diff --git a/‎.buildkite/nightly-benchmarks/tests/genai-perf-tests.json‎
Lines changed: 23 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/tests/genai-perf-tests.json‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎.buildkite/run-cpu-test.sh‎
Lines changed: 2 additions & 2 deletions b/‎.buildkite/run-cpu-test.sh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.buildkite/run-hpu-test.sh‎
Lines changed: 10 additions & 2 deletions b/‎.buildkite/run-hpu-test.sh‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎.buildkite/test-pipeline.yaml‎
Lines changed: 8 additions & 2 deletions b/‎.buildkite/test-pipeline.yaml‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎.github/workflows/actionlint.yml‎
Lines changed: 0 additions & 41 deletions b/‎.github/workflows/actionlint.yml‎
Lines changed: 0 additions & 41 deletions
diff --git a/‎.github/workflows/clang-format.yml‎
Lines changed: 0 additions & 55 deletions b/‎.github/workflows/clang-format.yml‎
Lines changed: 0 additions & 55 deletions
diff --git a/‎.github/workflows/codespell.yml‎
Lines changed: 0 additions & 47 deletions b/‎.github/workflows/codespell.yml‎
Lines changed: 0 additions & 47 deletions
@@ -43,7 +43,7 @@ main() {
 
 
 
-    # The figures should be genereated by a separate process outside the CI/CD pipeline
+    # The figures should be generated by a separate process outside the CI/CD pipeline
 
     # # generate figures
     # python3 -m pip install tabulate pandas matplotlib
 
@@ -301,6 +301,104 @@ run_serving_tests() {
   kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+  # run genai-perf tests 
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')    
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+    
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+      
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+    echo "Client command: $client_command"
+
+    eval "$client_command"
+
+    #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
 
 prepare_dataset() {
 
@@ -328,12 +426,17 @@ main() {
 
   pip install -U transformers
 
+  pip install -r requirements-dev.txt
+  which genai-perf
+
   # check storage
   df -h
 
   ensure_installed wget
   ensure_installed curl
   ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d
 
   prepare_dataset
 
@@ -345,6 +448,10 @@ main() {
   # run the test
   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
 
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
   # upload benchmark results to buildkite
   python3 -m pip install tabulate pandas
   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
 
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
@@ -83,6 +83,6 @@ function cpu_tests() {
     tests/lora/test_qwen2vl.py"
 }
 
-# All of CPU tests are expected to be finished less than 25 mins.
+# All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -8,9 +8,17 @@ set -ex
 docker build -t hpu-test-env -f Dockerfile.hpu .
 
 # Setup cleanup
+# certain versions of HPU software stack have a bug that can
+# override the exit code of the script, so we need to use
+# separate remove_docker_container and remove_docker_container_and_exit
+# functions, while other platforms only need one remove_docker_container
+# function.
+EXITCODE=1
 remove_docker_container() { docker rm -f hpu-test || true; }
-trap remove_docker_container EXIT
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+EXITCODE=$?
@@ -52,7 +52,6 @@ steps:
   - tests/worker
   - tests/standalone_tests/lazy_torch_compile.py
   commands:
-  - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git  # Used by multimoda processing test
   - python3 standalone_tests/lazy_torch_compile.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
@@ -107,7 +106,7 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
@@ -126,11 +125,15 @@ steps:
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
+  - examples/offline_inference/rlhf.py
   commands:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - python3 ../examples/offline_inference/rlhf.py
 
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2 
@@ -462,7 +465,10 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
   commands:
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
Original file line number	Diff line number	Diff line change
`@@ -83,6 +83,6 @@ function cpu_tests() {`
`83`	`83`	`tests/lora/test_qwen2vl.py"`
`84`	`84`	`}`
`85`	`85`
`86`		`-# All of CPU tests are expected to be finished less than 25 mins.`
	`86`	`+# All of CPU tests are expected to be finished less than 40 mins.`
`87`	`87`	`export -f cpu_tests`
`88`		`-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"`
	`88`	`+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"`