Skip to content

Commit f8d74f2

Browse files
authored
Migrate to consolidated e2e test suite (#760)
* consolidate e2es infra Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * update ci Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add full e2e tests on kind with approval on ci Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * rm unused file Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * update ci image build Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * ci updates Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * make ci final output clear Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * update e2e test full tri gger ci Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * ci manual trigger update Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * valid ci e2e triggers group Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> --------- Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent f881df1 commit f8d74f2

24 files changed

+6028
-49
lines changed

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -919,18 +919,31 @@ jobs:
919919

920920
- name: Run OpenShift E2E tests
921921
env:
922-
CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
922+
# Consolidated e2e test environment variables
923+
ENVIRONMENT: openshift
924+
USE_SIMULATOR: "false"
925+
SCALE_TO_ZERO_ENABLED: "true"
926+
WVA_NAMESPACE: ${{ env.WVA_NAMESPACE }}
923927
MONITORING_NAMESPACE: openshift-user-workload-monitoring
924928
LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
929+
# Legacy variables for backward compatibility (if needed by tests)
930+
CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
925931
# Multi-model testing: secondary namespace for Model B
926932
LLMD_NAMESPACE_B: ${{ env.LLMD_NAMESPACE_B }}
927933
GATEWAY_NAME: infra-inference-scheduling-inference-gateway-istio
928934
DEPLOYMENT: ms-inference-scheduling-llm-d-modelservice-decode
929935
# Pass WVA_RELEASE_NAME so test can filter for current run's resources
930936
WVA_RELEASE_NAME: ${{ env.WVA_RELEASE_NAME }}
937+
MODEL_ID: ${{ env.MODEL_ID }}
938+
REQUEST_RATE: ${{ env.REQUEST_RATE }}
939+
NUM_PROMPTS: ${{ env.NUM_PROMPTS }}
931940
run: |
932-
echo "Running OpenShift E2E tests with configuration:"
933-
echo " CONTROLLER_NAMESPACE: $CONTROLLER_NAMESPACE"
941+
echo "Running consolidated E2E tests on OpenShift with configuration:"
942+
echo " ENVIRONMENT: $ENVIRONMENT"
943+
echo " USE_SIMULATOR: $USE_SIMULATOR"
944+
echo " SCALE_TO_ZERO_ENABLED: $SCALE_TO_ZERO_ENABLED"
945+
echo " WVA_NAMESPACE: $WVA_NAMESPACE"
946+
echo " MONITORING_NAMESPACE: $MONITORING_NAMESPACE"
934947
echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
935948
echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
936949
echo " DEPLOYMENT: $DEPLOYMENT"
@@ -939,7 +952,7 @@ jobs:
939952
echo " REQUEST_RATE: $REQUEST_RATE"
940953
echo " NUM_PROMPTS: $NUM_PROMPTS"
941954
echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
942-
make test-e2e-openshift
955+
make test-e2e-full
943956
944957
- name: Cleanup infrastructure
945958
# Cleanup on success or cancellation, but NOT on failure (preserve for debugging)

.github/workflows/ci-pr-checks.yaml

Lines changed: 230 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,86 @@
11
name: CI - PR Checks
22

3+
# Cancel previous runs on the same PR when new commits are pushed
4+
# Only group by PR number for legitimate triggers (pull_request, workflow_dispatch, /trigger-e2e-full comments)
5+
# Regular comments get a unique group (run_id) so they don't cancel in-progress test runs
6+
#
7+
# Logic:
8+
# - Regular comments (not /trigger-e2e-full): unique group prevents cancellation of real tests
9+
# - Valid triggers: group 'ci-pr-checks-{pr_number}' (can cancel previous runs for same PR)
10+
# - Fallback chain for ID: pull_request.number -> issue.number -> run_id
11+
#
12+
# NOTE: Valid command list (/trigger-e2e-full) must stay in sync with check-full-tests job validation
13+
concurrency:
14+
group: >-
15+
${{
16+
github.event_name == 'issue_comment' &&
17+
!contains(github.event.comment.body, '/trigger-e2e-full')
18+
&& format('comment-isolated-{0}', github.run_id)
19+
|| format('ci-pr-checks-{0}',
20+
github.event.pull_request.number
21+
|| github.event.issue.number
22+
|| github.run_id)
23+
}}
24+
cancel-in-progress: true
25+
326
on:
427
pull_request:
528
branches:
629
- main
730
- dev
31+
# Allow manual triggering of full e2e tests
32+
workflow_dispatch:
33+
inputs:
34+
run_full_tests:
35+
description: 'Run full e2e test suite on Kind (default: smoke tests only)'
36+
required: false
37+
default: false
38+
type: boolean
39+
# Allow triggering via PR comments
40+
issue_comment:
41+
types: [created]
842

943
jobs:
1044
# Check if PR contains code changes (not just docs/metadata)
1145
check-code-changes:
1246
runs-on: ubuntu-latest
1347
permissions:
1448
contents: read
49+
pull-requests: read # For reading PR details when triggered via issue_comment
1550
outputs:
16-
has_code_changes: ${{ steps.filter.outputs.code }}
51+
has_code_changes: ${{ steps.set-output.outputs.has_code_changes }}
1752
steps:
53+
- name: Get PR number for issue_comment events
54+
id: pr-info
55+
if: github.event_name == 'issue_comment'
56+
uses: actions/github-script@v7
57+
with:
58+
script: |
59+
const issue = context.payload.issue;
60+
if (!issue.pull_request) {
61+
core.setOutput('pr_number', '');
62+
return;
63+
}
64+
const { data: pr } = await github.rest.pulls.get({
65+
owner: context.repo.owner,
66+
repo: context.repo.repo,
67+
pull_number: issue.number
68+
});
69+
core.setOutput('pr_number', issue.number.toString());
70+
core.setOutput('pr_head_sha', pr.head.sha);
71+
1872
- name: Checkout source
1973
uses: actions/checkout@v4
74+
with:
75+
# For issue_comment events, checkout the PR head SHA
76+
ref: ${{ github.event_name == 'issue_comment' && steps.pr-info.outputs.pr_head_sha || github.event.pull_request.head.sha || github.sha }}
77+
# For pull_request events, fetch the PR head
78+
fetch-depth: 0
2079

2180
- name: Check for code changes
2281
uses: dorny/paths-filter@v3
2382
id: filter
83+
continue-on-error: true # Don't fail if paths-filter can't determine changes (e.g., issue_comment without PR context)
2484
with:
2585
filters: |
2686
code:
@@ -30,6 +90,20 @@ jobs:
3090
- '!LICENSE'
3191
- '!OWNERS'
3292
- '!PROJECT'
93+
94+
- name: Set output with default
95+
id: set-output
96+
run: |
97+
# Use filter output if available, otherwise default to 'true' for issue_comment events
98+
# This ensures /trigger-e2e-full works even if PR context is unclear
99+
FILTER_OUTPUT="${{ steps.filter.outputs.code }}"
100+
if [ "${{ github.event_name }}" == "issue_comment" ] && [ -z "$FILTER_OUTPUT" ]; then
101+
echo "has_code_changes=true" >> $GITHUB_OUTPUT
102+
elif [ -n "$FILTER_OUTPUT" ]; then
103+
echo "has_code_changes=$FILTER_OUTPUT" >> $GITHUB_OUTPUT
104+
else
105+
echo "has_code_changes=true" >> $GITHUB_OUTPUT
106+
fi
33107
34108
lint-and-test:
35109
runs-on: ubuntu-latest
@@ -68,28 +142,115 @@ jobs:
68142
run: |
69143
make test
70144
71-
# E2E tests run in parallel with different configurations:
72-
# - HPAScaleToZero feature gate settings (true/false)
73-
# - GPU types (nvidia/amd) for limiter tests
74-
# Skip e2e tests if PR only contains docs/metadata changes
145+
# Check if full e2e tests should run (via workflow_dispatch or comment trigger)
146+
check-full-tests:
147+
runs-on: ubuntu-latest
148+
permissions:
149+
contents: read
150+
pull-requests: write # For posting comments and reactions on PRs
151+
outputs:
152+
run_full: ${{ steps.check.outputs.run_full }}
153+
steps:
154+
- name: Check if full tests requested
155+
id: check
156+
uses: actions/github-script@v7
157+
with:
158+
script: |
159+
// Helper to check if user has write access
160+
async function hasWriteAccess(username) {
161+
try {
162+
const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
163+
owner: context.repo.owner,
164+
repo: context.repo.repo,
165+
username: username
166+
});
167+
const privilegedRoles = ['admin', 'maintain', 'write'];
168+
return privilegedRoles.includes(permission.permission);
169+
} catch (e) {
170+
console.log(`Could not get permissions for ${username}: ${e.message}`);
171+
return false;
172+
}
173+
}
174+
175+
// Check workflow_dispatch input
176+
const workflowInput = '${{ github.event.inputs.run_full_tests }}';
177+
// Handle both boolean and string inputs
178+
if (workflowInput === 'true' || workflowInput === true || workflowInput === 'True') {
179+
core.setOutput('run_full', 'true');
180+
return;
181+
}
182+
183+
// Check for /trigger-e2e-full comment trigger
184+
if (context.eventName === 'issue_comment') {
185+
const comment = context.payload.comment.body.trim();
186+
const issue = context.payload.issue;
187+
188+
// Only process /trigger-e2e-full comments on PRs
189+
if (!issue.pull_request) {
190+
console.log('Comment is not on a PR, skipping');
191+
core.setOutput('run_full', 'false');
192+
return;
193+
}
194+
195+
// Strict command matching (case-sensitive, exact match)
196+
const validCommands = ['/trigger-e2e-full'];
197+
if (!validCommands.includes(comment)) {
198+
console.log(`Comment "${comment}" is not a valid trigger command, skipping`);
199+
core.setOutput('run_full', 'false');
200+
return;
201+
}
202+
203+
// Check if commenter has write access
204+
const commenter = context.payload.comment.user.login;
205+
const hasAccess = await hasWriteAccess(commenter);
206+
if (!hasAccess) {
207+
console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
208+
core.setOutput('run_full', 'false');
209+
return;
210+
}
211+
212+
// Get PR details
213+
const { data: pr } = await github.rest.pulls.get({
214+
owner: context.repo.owner,
215+
repo: context.repo.repo,
216+
pull_number: issue.number
217+
});
218+
219+
console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
220+
console.log(`PR head SHA: ${pr.head.sha}`);
221+
222+
// Add reaction to acknowledge
223+
await github.rest.reactions.createForIssueComment({
224+
owner: context.repo.owner,
225+
repo: context.repo.repo,
226+
comment_id: context.payload.comment.id,
227+
content: 'rocket'
228+
});
229+
230+
// Post comment with link to the workflow run
231+
const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
232+
await github.rest.issues.createComment({
233+
owner: context.repo.owner,
234+
repo: context.repo.repo,
235+
issue_number: issue.number,
236+
body: `🚀 **Full E2E tests triggered by ${comment}**\n\n[View the Kind E2E workflow run](${runUrl})`
237+
});
238+
239+
core.setOutput('run_full', 'true');
240+
return;
241+
}
242+
243+
core.setOutput('run_full', 'false');
244+
245+
# E2E tests - smoke tests run automatically, full tests on approval
246+
# Skip e2e tests if PR only contains docs/metadata changes (unless explicitly triggered via /trigger-e2e-full)
75247
e2e-tests:
76248
runs-on: ubuntu-latest
77-
needs: [lint-and-test, check-code-changes]
78-
if: needs.check-code-changes.outputs.has_code_changes == 'true'
249+
needs: [lint-and-test, check-code-changes, check-full-tests]
250+
if: always() && (needs.check-full-tests.outputs.run_full == 'true' || (needs.check-code-changes.result == 'success' && needs.check-code-changes.outputs.has_code_changes == 'true'))
79251
timeout-minutes: 60
80-
strategy:
81-
fail-fast: false
82-
matrix:
83-
include:
84-
# NVIDIA-focused: H100, A100, AMD MI300X heterogeneous for limiter tests
85-
- gpu_type: nvidia-mix
86-
scale_to_zero: true
87-
e2e_gpu_type: nvidia
88-
# AMD-focused: MI300X, MI250, NVIDIA A100 heterogeneous for limiter tests
89-
# - gpu_type: amd-mix
90-
# scale_to_zero: false
91-
# e2e_gpu_type: amd
92-
name: e2e-tests (gpu=${{ matrix.gpu_type }}, scale-to-zero=${{ matrix.scale_to_zero }})
252+
permissions:
253+
contents: read
93254
steps:
94255
- name: Checkout source
95256
uses: actions/checkout@v4
@@ -119,12 +280,55 @@ jobs:
119280
sudo mv ./kind /usr/local/bin/kind
120281
kind version
121282
122-
- name: Run e2e tests
283+
- name: Set up Docker Buildx
284+
uses: docker/setup-buildx-action@v3
285+
286+
- name: Build WVA image locally
287+
id: build-image
288+
run: |
289+
# Generate unique image tag for this PR run (local image, no registry needed)
290+
IMAGE_NAME="llm-d-workload-variant-autoscaler"
291+
IMAGE_TAG="pr-${GITHUB_RUN_ID}-${GITHUB_SHA:0:7}"
292+
# Use localhost prefix for local-only image (Kind will load it directly)
293+
FULL_IMAGE="localhost/${IMAGE_NAME}:${IMAGE_TAG}"
294+
295+
echo "Building local image: $FULL_IMAGE"
296+
echo "Image will be loaded into Kind cluster (no push needed)"
297+
298+
# Build image locally (no push needed for Kind)
299+
make docker-build IMG="$FULL_IMAGE"
300+
301+
echo "image=$FULL_IMAGE" >> $GITHUB_OUTPUT
302+
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
303+
echo "Image built locally: $FULL_IMAGE"
304+
305+
- name: Determine test type
306+
id: test-type
307+
run: |
308+
if [ "${{ needs.check-full-tests.outputs.run_full }}" == "true" ]; then
309+
echo "test_target=test-e2e-full-with-setup" >> $GITHUB_OUTPUT
310+
echo "test_name=full" >> $GITHUB_OUTPUT
311+
echo "scale_to_zero=true" >> $GITHUB_OUTPUT
312+
echo "delete_cluster=false" >> $GITHUB_OUTPUT
313+
else
314+
echo "test_target=test-e2e-smoke-with-setup" >> $GITHUB_OUTPUT
315+
echo "test_name=smoke" >> $GITHUB_OUTPUT
316+
echo "scale_to_zero=false" >> $GITHUB_OUTPUT
317+
echo "delete_cluster=true" >> $GITHUB_OUTPUT
318+
fi
319+
320+
- name: Run e2e tests (${{ steps.test-type.outputs.test_name }})
123321
shell: bash
124322
env:
125-
ENABLE_SCALE_TO_ZERO: ${{ matrix.scale_to_zero }}
126-
CLUSTER_GPU_TYPE: ${{ matrix.gpu_type }}
127-
E2E_GPU_TYPE: ${{ matrix.e2e_gpu_type }}
128-
MULTI_MODEL_TESTING: "false"
323+
ENVIRONMENT: kind-emulator
324+
USE_SIMULATOR: "true"
325+
SCALE_TO_ZERO_ENABLED: ${{ steps.test-type.outputs.scale_to_zero }}
326+
CREATE_CLUSTER: "true"
327+
INSTALL_GATEWAY_CTRLPLANE: "true"
328+
E2E_TESTS_ENABLED: "true"
329+
IMG: ${{ steps.build-image.outputs.image }}
330+
SKIP_BUILD: "true"
331+
PROMETHEUS_ADAPTER_WAIT: "false"
332+
DELETE_CLUSTER: ${{ steps.test-type.outputs.delete_cluster }}
129333
run: |
130-
make test-e2e
334+
make ${{ steps.test-type.outputs.test_target }}

0 commit comments

Comments
 (0)