Commit 4dbd01a

clubanderson and claude committed
chore: address Copilot review comments (round 2)
CI workflow improvements:
- Rename COMMIT_SHA to GIT_REF to clarify that inputs.ref may not be a SHA
- Fix HPA_STABILIZATION_SECONDS fallback from 120 to 30 to match input defaults
- Update kubectl version comment with 2025-12 verification date
- Add comment noting HF_TOKEN is inherited from GITHUB_ENV
- Extract orphaned resource cleanup to shell function to reduce duplication

Test improvements:
- Pin curl image to 8.11.1 instead of 'latest' for reproducible tests
- Add HPA selection validation to ensure correct HPA is selected
- Fix err variable shadowing (createErr instead of err)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent cd2b473 commit 4dbd01a

2 files changed: +42 additions, -33 deletions

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 27 additions & 24 deletions
@@ -104,15 +104,16 @@ jobs:
       env:
         REGISTRY: ghcr.io
         IMAGE_NAME: ${{ github.repository }}
-        # Use inputs.ref (PR head SHA) when available, otherwise fall back to GITHUB_SHA
-        COMMIT_SHA: ${{ inputs.ref || github.sha }}
+        # Use inputs.ref when available, otherwise fall back to GITHUB_SHA
+        # Note: inputs.ref could be a branch, tag, or SHA - we use the first 8 chars for tagging
+        GIT_REF: ${{ inputs.ref || github.sha }}
       run: |
-        # Build image with commit SHA tag for this PR
-        # Use first 8 chars of the commit SHA
-        IMAGE_TAG="sha-${COMMIT_SHA::8}"
+        # Build image with git ref tag for this PR
+        # Use first 8 chars of the git ref
+        IMAGE_TAG="ref-${GIT_REF::8}"
         FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
         echo "Building image: $FULL_IMAGE"
-        echo "Commit SHA: $COMMIT_SHA"
+        echo "Git ref: $GIT_REF"

         # Build and push using make targets
         make docker-build IMG="$FULL_IMAGE"
@@ -132,7 +133,7 @@ jobs:
         REQUEST_RATE: ${{ inputs.request_rate || github.event.inputs.request_rate || '20' }}
         NUM_PROMPTS: ${{ inputs.num_prompts || github.event.inputs.num_prompts || '3000' }}
         MAX_NUM_SEQS: ${{ inputs.max_num_seqs || github.event.inputs.max_num_seqs || '1' }}
-        HPA_STABILIZATION_SECONDS: ${{ inputs.hpa_stabilization_seconds || github.event.inputs.hpa_stabilization_seconds || '120' }}
+        HPA_STABILIZATION_SECONDS: ${{ inputs.hpa_stabilization_seconds || github.event.inputs.hpa_stabilization_seconds || '30' }}
         SKIP_CLEANUP: ${{ inputs.skip_cleanup || github.event.inputs.skip_cleanup || 'true' }}
         # Unique release names per run to avoid conflicts with other concurrent runs
         WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }}
@@ -157,8 +158,9 @@ jobs:
       - name: Install tools (kubectl, oc, helm, make)
         run: |
           sudo apt-get update && sudo apt-get install -y make
-          # Install kubectl - use hardcoded stable version for reproducible CI builds
-          # Pinned 2024-12: v1.31.0 is latest stable compatible with OpenShift 4.16+
+          # Install kubectl - use pinned version for reproducible CI builds
+          # Pinned 2025-12: v1.31.0 tested compatible with OpenShift 4.16+
+          # Update this version when upgrading target cluster or during regular dependency reviews
           KUBECTL_VERSION="v1.31.0"
           echo "Installing kubectl version: $KUBECTL_VERSION"
           curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
@@ -218,21 +220,21 @@ jobs:

          # Clean up cluster-scoped resources only if their namespace no longer exists
          echo "Checking for orphaned cluster-scoped resources..."
-          for cr in $(kubectl get clusterrole -l app.kubernetes.io/name=workload-variant-autoscaler -o name 2>/dev/null); do
-            ns=$(kubectl get $cr -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-namespace}' 2>/dev/null || echo "")
-            if [ -n "$ns" ] && ! kubectl get namespace "$ns" &>/dev/null; then
-              echo "Removing orphaned $cr (namespace $ns no longer exists)"
-              kubectl delete $cr --ignore-not-found || true
-            fi
-          done
-
-          for crb in $(kubectl get clusterrolebinding -l app.kubernetes.io/name=workload-variant-autoscaler -o name 2>/dev/null); do
-            ns=$(kubectl get $crb -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-namespace}' 2>/dev/null || echo "")
-            if [ -n "$ns" ] && ! kubectl get namespace "$ns" &>/dev/null; then
-              echo "Removing orphaned $crb (namespace $ns no longer exists)"
-              kubectl delete $crb --ignore-not-found || true
-            fi
-          done
+
+          # Helper function to clean up orphaned cluster-scoped resources
+          cleanup_orphaned_resources() {
+            local resource_type="$1"
+            for res in $(kubectl get "$resource_type" -l app.kubernetes.io/name=workload-variant-autoscaler -o name 2>/dev/null); do
+              ns=$(kubectl get "$res" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-namespace}' 2>/dev/null || echo "")
+              if [ -n "$ns" ] && ! kubectl get namespace "$ns" &>/dev/null; then
+                echo "Removing orphaned $res (namespace $ns no longer exists)"
+                kubectl delete "$res" --ignore-not-found || true
+              fi
+            done
+          }
+
+          cleanup_orphaned_resources "clusterrole"
+          cleanup_orphaned_resources "clusterrolebinding"

          echo "Orphaned resource cleanup complete"
@@ -245,6 +247,7 @@ jobs:

      - name: Deploy WVA and llm-d infrastructure
        env:
+          # HF_TOKEN is inherited from GITHUB_ENV (set in 'Get HF token from cluster secret' step)
          ENVIRONMENT: openshift
          INSTALL_GATEWAY_CTRLPLANE: "false"
          E2E_TESTS_ENABLED: "true"

test/e2e-openshift/sharegpt_scaleup_test.go

Lines changed: 15 additions & 9 deletions
@@ -94,12 +94,18 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 			Expect(err).NotTo(HaveOccurred(), "Should be able to list HPAs")
 			Expect(hpaList.Items).NotTo(BeEmpty(), "At least one WVA HPA should exist")

-			// Use the first HPA found (there should be one per WVA release)
-			hpa := &hpaList.Items[0]
+			// Select the HPA that targets the expected deployment
+			// This validation ensures we pick the correct HPA if multiple WVA releases exist
+			var hpa *autoscalingv2.HorizontalPodAutoscaler
+			for i := range hpaList.Items {
+				if hpaList.Items[i].Spec.ScaleTargetRef.Name == deployment {
+					hpa = &hpaList.Items[i]
+					break
+				}
+			}
+			Expect(hpa).NotTo(BeNil(), "An HPA targeting deployment %s should exist", deployment)
 			hpaName = hpa.Name
-			_, _ = fmt.Fprintf(GinkgoWriter, "Found HPA: %s\n", hpaName)
-
-			Expect(hpa.Spec.ScaleTargetRef.Name).To(Equal(deployment), "HPA should target the correct deployment")
+			_, _ = fmt.Fprintf(GinkgoWriter, "Found HPA: %s (targets %s)\n", hpaName, deployment)

 			By("finding vllm-service by label selector")
 			// Use release-specific label selector if WVA_RELEASE_NAME is set
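Aside on the selection loop above (not part of the commit): taking the address of the indexed element, &hpaList.Items[i], yields a pointer into the slice itself, whereas taking the address of a range value variable would point at a copy of the element (and, before Go 1.22, at a single variable reused across iterations). A minimal standalone sketch of that difference, with hypothetical names not taken from this repo:

package main

import "fmt"

// hpa stands in for autoscalingv2.HorizontalPodAutoscaler in the test above.
type hpa struct{ Name, Target string }

func main() {
	items := []hpa{{Name: "hpa-a", Target: "web"}, {Name: "hpa-b", Target: "api"}}

	// Indexing and taking the address yields a pointer to the element
	// stored in the slice, which is what the test's selection loop relies on.
	var picked *hpa
	for i := range items {
		if items[i].Target == "api" {
			picked = &items[i]
			break
		}
	}

	// Ranging by value copies each element, so this pointer refers to the
	// loop variable's copy, not to the slice element itself
	// (and before Go 1.22 that variable was reused across iterations).
	var copied *hpa
	for _, h := range items {
		if h.Target == "api" {
			copied = &h
			break
		}
	}

	picked.Name = "renamed"
	fmt.Println(items[1].Name) // renamed  (picked aliases the slice element)
	fmt.Println(copied.Name)   // hpa-b    (copied points at a detached copy)
}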
@@ -177,7 +183,7 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 					RestartPolicy: corev1.RestartPolicyNever,
 					Containers: []corev1.Container{{
 						Name:    "health-check",
-						Image:   "quay.io/curl/curl:latest",
+						Image:   "quay.io/curl/curl:8.11.1",
 						Command: []string{"/bin/sh", "-c"},
 						Args:    []string{fmt.Sprintf(`echo "Checking vLLM readiness at %s:8200..." && curl -sf --max-time 10 http://%s:8200/v1/models && echo "vLLM is ready!" && exit 0; echo "vLLM not ready yet"; exit 1`, vllmServiceName, vllmServiceName)},
 					}},
@@ -194,8 +200,8 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 			time.Sleep(2 * time.Second)

 			// Create and wait for health check job
-			_, err := k8sClient.BatchV1().Jobs(llmDNamespace).Create(ctx, healthCheckJob, metav1.CreateOptions{})
-			Expect(err).NotTo(HaveOccurred(), "Should be able to create health check job")
+			_, createErr := k8sClient.BatchV1().Jobs(llmDNamespace).Create(ctx, healthCheckJob, metav1.CreateOptions{})
+			Expect(createErr).NotTo(HaveOccurred(), "Should be able to create health check job")

 			Eventually(func(g Gomega) {
 				job, err := k8sClient.BatchV1().Jobs(llmDNamespace).Get(ctx, "vllm-health-check", metav1.GetOptions{})
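Background on the createErr rename (a minimal sketch with hypothetical names, not code from this repo): a := declaration inside a nested closure introduces a new err that shadows any err in an enclosing scope, so a failure assigned to the inner variable never reaches the outer one. Giving the inner result a distinct name, as the test now does, avoids the shadowing entirely.

package main

import (
	"errors"
	"fmt"
)

// create stands in for a call like k8sClient.BatchV1().Jobs(ns).Create(...).
func create() (string, error) { return "", errors.New("create failed") }

func main() {
	var err error // outer err, as if populated by earlier setup code

	func() {
		// `:=` declares a NEW err scoped to this closure, shadowing the outer one.
		_, err := create()
		fmt.Println("inner err:", err) // inner err: create failed
	}()

	// The outer err was never assigned, so the inner failure is invisible here.
	fmt.Println("outer err:", err) // outer err: <nil>

	func() {
		// A distinct name (as the test now uses with createErr) keeps the
		// two scopes unambiguous and satisfies shadow-checking linters.
		_, createErr := create()
		fmt.Println("createErr:", createErr) // createErr: create failed
	}()
}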
@@ -599,7 +605,7 @@ exit 0
 				Containers: []corev1.Container{
 					{
 						Name:    "load-generator",
-						Image:   "quay.io/curl/curl:latest",
+						Image:   "quay.io/curl/curl:8.11.1",
 						Command: []string{"/bin/sh", "-c"},
 						Args:    []string{script},
 						Resources: corev1.ResourceRequirements{
