Skip to content

Commit e836699

Browse files
Add nightly benchmark run on GKE (#456)
* Add github workflow to run benchmark on GKE * Add LD_LIBRARY_PATH to scenario so it doesn't apply to all cases * Escape LD_LIBRARY_PATH so it is treated as a fixed string
1 parent 90464ad commit e836699

File tree

2 files changed

+131
-0
lines changed

2 files changed

+131
-0
lines changed
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
# Nightly benchmark of the standalone deployment on GKE.
#
# Triggers: a daily cron schedule and manual dispatch. On manual dispatch the
# caller may override which directory is archived (input_dir) and the name of
# the uploaded artifact (output_dir); scheduled runs fall back to the defaults
# computed in the "Set input and output directory environment variables" step.
name: CI - Nightly Benchmark on GKE

on:
  workflow_dispatch:
    inputs:
      input_dir:
        description: 'Input directory for benchmark results'
        required: false
        default: '/tmp/cicd/analysis'
      output_dir:
        description: 'Output directory name'
        required: false
        default: ''

  # push:
  #   branches:
  #     - main

  schedule:
    - cron: '0 0 * * *'

jobs:
  run-benchmark-gke:
    name: CI - Nightly Benchmark on GKE
    runs-on: [k8s-util]
    timeout-minutes: 240

    # Job-level environment. GATEWAY and GATEWAY_TYPE are not referenced by any
    # step in this file; presumably the ./setup scripts read them - TODO confirm.
    env:
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      GKE_CLUSTER_ZONE: us-east5
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - uses: actions/setup-python@v6
        with:
          python-version: '3.11'

      - name: Display OS used
        run: |
          cat /etc/*os-*
        shell: bash

      # Exports INPUT_DIR and OUTPUT_DIR to later steps through GITHUB_ENV.
      # The dispatch inputs are handed to the script as environment variables
      # rather than being expanded with ${{ }} inside the script text, so that
      # user-supplied values are never parsed as shell code.
      - name: Set input and output directory environment variables
        env:
          REQUESTED_INPUT_DIR: ${{ github.event.inputs.input_dir }}
          REQUESTED_OUTPUT_DIR: ${{ github.event.inputs.output_dir }}
        run: |
          DEFAULT_INPUT_DIR=/tmp/cicd/analysis
          # Inputs are empty on scheduled runs, so fall back to the default.
          INPUT_DIR="${REQUESTED_INPUT_DIR:-$DEFAULT_INPUT_DIR}"
          echo "INPUT_DIR=$INPUT_DIR" >> "$GITHUB_ENV"

          if [ -z "$REQUESTED_OUTPUT_DIR" ]; then
            timestamp=$(date -u +%Y%m%dT%H%M%SZ)
            OUTPUT_DIR="benchmark-results-${timestamp}"
            echo "Using generated output dir: $OUTPUT_DIR"
          else
            OUTPUT_DIR="$REQUESTED_OUTPUT_DIR"
            echo "Using provided output dir: $OUTPUT_DIR"
          fi
          echo "OUTPUT_DIR=$OUTPUT_DIR" >> "$GITHUB_ENV"

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      # Job-level env values are also exported to the step's shell, so they are
      # read here as ordinary shell variables.
      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" --zone "$GKE_CLUSTER_ZONE"

      - name: Run install_deps.sh
        run: |
          sudo apt-get update
          ./setup/install_deps.sh
        shell: bash

      - name: Install config explorer dependencies
        run: pip install ./config_explorer
        shell: bash

      # Teardown before standup, using the same scenario and target as the
      # steps below.
      - name: Cleanup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/teardown.sh -c gke_H100_fb -t standalone -d

      - name: Standup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/standup.sh -c gke_H100_fb -t standalone

      - name: Run benchmark (standalone, inference-perf)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/run.sh -c gke_H100_fb -t standalone

      # always(): run the teardown even when standup or the benchmark failed or
      # the run was cancelled; without it a failed run skips this step and
      # leaves the deployment in place on the cluster.
      - name: Cleanup target cloud (standalone)
        if: always()
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/teardown.sh -c gke_H100_fb -t standalone -d

      # Uploads on success or failure (not on cancellation) so that partial
      # results from a failed run are still archived.
      - name: Archive benchmark results as GitHub artifact
        if: success() || failure()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.OUTPUT_DIR }}
          path: ${{ env.INPUT_DIR }}
          retention-days: 14

scenarios/cicd/gke_H100_fb.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# CI/CD scenario "gke_H100_fb": environment overrides for the benchmark run
# selected with `-c gke_H100_fb`. Every setting is exported; the assignments
# are independent of one another.

# --- Work directory ---------------------------------------------------------
export LLMDBENCH_CONTROL_WORK_DIR=/tmp/cicd/

# --- Namespaces and release name (all share the same value) ------------------
export LLMDBENCH_VLLM_COMMON_NAMESPACE=llmdbenchcicd
export LLMDBENCH_HARNESS_NAMESPACE=llmdbenchcicd
export LLMDBENCH_VLLM_MODELSERVICE_RELEASE=llmdbenchcicd

# --- Model and serving resources ---------------------------------------------
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.2-1B"
export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb
export LLMDBENCH_VLLM_COMMON_REPLICAS=1
export LLMDBENCH_VLLM_COMMON_ACCELERATOR_NR=1

# --- Model cache storage -----------------------------------------------------
export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti

# --- Benchmark harness -------------------------------------------------------
export LLMDBENCH_HARNESS_NAME=inference-perf
export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=sanity_random.yaml

# --- Environment variables listed for the generated YAML ---------------------
# The backslash stops the shell from expanding ${LD_LIBRARY_PATH} here, so the
# exported value is the literal text "${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64".
export LD_LIBRARY_PATH="\${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64"
# Comma-separated variable names; LD_LIBRARY_PATH above is one of them.
export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=LLMDBENCH_VLLM_STANDALONE_VLLM_ALLOW_LONG_MAX_MODEL_LEN,LLMDBENCH_VLLM_STANDALONE_VLLM_SERVER_DEV_MODE,LD_LIBRARY_PATH

0 commit comments

Comments
 (0)