Commit 4dbd01a

clubanderson and claude committed
chore: address Copilot review comments (round 2)
CI workflow improvements:
- Rename COMMIT_SHA to GIT_REF to clarify that inputs.ref may not be a SHA
- Fix HPA_STABILIZATION_SECONDS fallback from 120 to 30 to match input defaults
- Update kubectl version comment with 2025-12 verification date
- Add comment noting HF_TOKEN is inherited from GITHUB_ENV
- Extract orphaned resource cleanup to shell function to reduce duplication

Test improvements:
- Pin curl image to 8.11.1 instead of 'latest' for reproducible tests
- Add HPA selection validation to ensure correct HPA is selected
- Fix err variable shadowing (createErr instead of err)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent cd2b473 commit 4dbd01a

2 files changed: +42 additions, -33 deletions

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 27 additions & 24 deletions
@@ -104,15 +104,16 @@ jobs:
       env:
         REGISTRY: ghcr.io
         IMAGE_NAME: ${{ github.repository }}
-        # Use inputs.ref (PR head SHA) when available, otherwise fall back to GITHUB_SHA
-        COMMIT_SHA: ${{ inputs.ref || github.sha }}
+        # Use inputs.ref when available, otherwise fall back to GITHUB_SHA
+        # Note: inputs.ref could be a branch, tag, or SHA - we use the first 8 chars for tagging
+        GIT_REF: ${{ inputs.ref || github.sha }}
       run: |
-        # Build image with commit SHA tag for this PR
-        # Use first 8 chars of the commit SHA
-        IMAGE_TAG="sha-${COMMIT_SHA::8}"
+        # Build image with git ref tag for this PR
+        # Use first 8 chars of the git ref
+        IMAGE_TAG="ref-${GIT_REF::8}"
         FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
         echo "Building image: $FULL_IMAGE"
-        echo "Commit SHA: $COMMIT_SHA"
+        echo "Git ref: $GIT_REF"

         # Build and push using make targets
         make docker-build IMG="$FULL_IMAGE"
@@ -132,7 +133,7 @@ jobs:
         REQUEST_RATE: ${{ inputs.request_rate || github.event.inputs.request_rate || '20' }}
         NUM_PROMPTS: ${{ inputs.num_prompts || github.event.inputs.num_prompts || '3000' }}
         MAX_NUM_SEQS: ${{ inputs.max_num_seqs || github.event.inputs.max_num_seqs || '1' }}
-        HPA_STABILIZATION_SECONDS: ${{ inputs.hpa_stabilization_seconds || github.event.inputs.hpa_stabilization_seconds || '120' }}
+        HPA_STABILIZATION_SECONDS: ${{ inputs.hpa_stabilization_seconds || github.event.inputs.hpa_stabilization_seconds || '30' }}
         SKIP_CLEANUP: ${{ inputs.skip_cleanup || github.event.inputs.skip_cleanup || 'true' }}
         # Unique release names per run to avoid conflicts with other concurrent runs
         WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }}
@@ -157,8 +158,9 @@ jobs:
       - name: Install tools (kubectl, oc, helm, make)
         run: |
           sudo apt-get update && sudo apt-get install -y make
-          # Install kubectl - use hardcoded stable version for reproducible CI builds
-          # Pinned 2024-12: v1.31.0 is latest stable compatible with OpenShift 4.16+
+          # Install kubectl - use pinned version for reproducible CI builds
+          # Pinned 2025-12: v1.31.0 tested compatible with OpenShift 4.16+
+          # Update this version when upgrading target cluster or during regular dependency reviews
           KUBECTL_VERSION="v1.31.0"
           echo "Installing kubectl version: $KUBECTL_VERSION"
           curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
@@ -218,21 +220,21 @@ jobs:

          # Clean up cluster-scoped resources only if their namespace no longer exists
          echo "Checking for orphaned cluster-scoped resources..."
-          for cr in $(kubectl get clusterrole -l app.kubernetes.io/name=workload-variant-autoscaler -o name 2>/dev/null); do
-            ns=$(kubectl get $cr -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-namespace}' 2>/dev/null || echo "")
-            if [ -n "$ns" ] && ! kubectl get namespace "$ns" &>/dev/null; then
-              echo "Removing orphaned $cr (namespace $ns no longer exists)"
-              kubectl delete $cr --ignore-not-found || true
-            fi
-          done
-
-          for crb in $(kubectl get clusterrolebinding -l app.kubernetes.io/name=workload-variant-autoscaler -o name 2>/dev/null); do
-            ns=$(kubectl get $crb -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-namespace}' 2>/dev/null || echo "")
-            if [ -n "$ns" ] && ! kubectl get namespace "$ns" &>/dev/null; then
-              echo "Removing orphaned $crb (namespace $ns no longer exists)"
-              kubectl delete $crb --ignore-not-found || true
-            fi
-          done
+
+          # Helper function to clean up orphaned cluster-scoped resources
+          cleanup_orphaned_resources() {
+            local resource_type="$1"
+            for res in $(kubectl get "$resource_type" -l app.kubernetes.io/name=workload-variant-autoscaler -o name 2>/dev/null); do
+              ns=$(kubectl get "$res" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-namespace}' 2>/dev/null || echo "")
+              if [ -n "$ns" ] && ! kubectl get namespace "$ns" &>/dev/null; then
+                echo "Removing orphaned $res (namespace $ns no longer exists)"
+                kubectl delete "$res" --ignore-not-found || true
+              fi
+            done
+          }
+
+          cleanup_orphaned_resources "clusterrole"
+          cleanup_orphaned_resources "clusterrolebinding"

          echo "Orphaned resource cleanup complete"
@@ -245,6 +247,7 @@ jobs:

      - name: Deploy WVA and llm-d infrastructure
        env:
+          # HF_TOKEN is inherited from GITHUB_ENV (set in 'Get HF token from cluster secret' step)
          ENVIRONMENT: openshift
          INSTALL_GATEWAY_CTRLPLANE: "false"
          E2E_TESTS_ENABLED: "true"

test/e2e-openshift/sharegpt_scaleup_test.go

Lines changed: 15 additions & 9 deletions
@@ -94,12 +94,18 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 			Expect(err).NotTo(HaveOccurred(), "Should be able to list HPAs")
 			Expect(hpaList.Items).NotTo(BeEmpty(), "At least one WVA HPA should exist")

-			// Use the first HPA found (there should be one per WVA release)
-			hpa := &hpaList.Items[0]
+			// Select the HPA that targets the expected deployment
+			// This validation ensures we pick the correct HPA if multiple WVA releases exist
+			var hpa *autoscalingv2.HorizontalPodAutoscaler
+			for i := range hpaList.Items {
+				if hpaList.Items[i].Spec.ScaleTargetRef.Name == deployment {
+					hpa = &hpaList.Items[i]
+					break
+				}
+			}
+			Expect(hpa).NotTo(BeNil(), "An HPA targeting deployment %s should exist", deployment)
 			hpaName = hpa.Name
-			_, _ = fmt.Fprintf(GinkgoWriter, "Found HPA: %s\n", hpaName)
-
-			Expect(hpa.Spec.ScaleTargetRef.Name).To(Equal(deployment), "HPA should target the correct deployment")
+			_, _ = fmt.Fprintf(GinkgoWriter, "Found HPA: %s (targets %s)\n", hpaName, deployment)

 			By("finding vllm-service by label selector")
 			// Use release-specific label selector if WVA_RELEASE_NAME is set
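Aside on the selection loop above (not part of the commit): taking the address of the indexed element, &hpaList.Items[i], yields a pointer into the slice itself, whereas taking the address of a range value variable would point at a copy of the element (and, before Go 1.22, at a single variable reused across iterations). A minimal standalone sketch of that difference, with hypothetical names not taken from this repo:

package main

import "fmt"

// hpa stands in for autoscalingv2.HorizontalPodAutoscaler in the test above.
type hpa struct{ Name, Target string }

func main() {
	items := []hpa{{Name: "hpa-a", Target: "web"}, {Name: "hpa-b", Target: "api"}}

	// Indexing and taking the address yields a pointer to the element
	// stored in the slice, which is what the test's selection loop relies on.
	var picked *hpa
	for i := range items {
		if items[i].Target == "api" {
			picked = &items[i]
			break
		}
	}

	// Ranging by value copies each element, so this pointer refers to the
	// loop variable's copy, not to the slice element itself
	// (and before Go 1.22 that variable was reused across iterations).
	var copied *hpa
	for _, h := range items {
		if h.Target == "api" {
			copied = &h
			break
		}
	}

	picked.Name = "renamed"
	fmt.Println(items[1].Name) // renamed  (picked aliases the slice element)
	fmt.Println(copied.Name)   // hpa-b    (copied points at a detached copy)
}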
@@ -177,7 +183,7 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 					RestartPolicy: corev1.RestartPolicyNever,
 					Containers: []corev1.Container{{
 						Name:    "health-check",
-						Image:   "quay.io/curl/curl:latest",
+						Image:   "quay.io/curl/curl:8.11.1",
 						Command: []string{"/bin/sh", "-c"},
 						Args:    []string{fmt.Sprintf(`echo "Checking vLLM readiness at %s:8200..." && curl -sf --max-time 10 http://%s:8200/v1/models && echo "vLLM is ready!" && exit 0; echo "vLLM not ready yet"; exit 1`, vllmServiceName, vllmServiceName)},
 					}},
@@ -194,8 +200,8 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 			time.Sleep(2 * time.Second)

 			// Create and wait for health check job
-			_, err := k8sClient.BatchV1().Jobs(llmDNamespace).Create(ctx, healthCheckJob, metav1.CreateOptions{})
-			Expect(err).NotTo(HaveOccurred(), "Should be able to create health check job")
+			_, createErr := k8sClient.BatchV1().Jobs(llmDNamespace).Create(ctx, healthCheckJob, metav1.CreateOptions{})
+			Expect(createErr).NotTo(HaveOccurred(), "Should be able to create health check job")

 			Eventually(func(g Gomega) {
 				job, err := k8sClient.BatchV1().Jobs(llmDNamespace).Get(ctx, "vllm-health-check", metav1.GetOptions{})
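Background on the createErr rename (a minimal sketch with hypothetical names, not code from this repo): a := declaration inside a nested closure introduces a new err that shadows any err in an enclosing scope, so a failure assigned to the inner variable never reaches the outer one. Giving the inner result a distinct name, as the test now does, avoids the shadowing entirely.

package main

import (
	"errors"
	"fmt"
)

// create stands in for a call like k8sClient.BatchV1().Jobs(ns).Create(...).
func create() (string, error) { return "", errors.New("create failed") }

func main() {
	var err error // outer err, as if populated by earlier setup code

	func() {
		// `:=` declares a NEW err scoped to this closure, shadowing the outer one.
		_, err := create()
		fmt.Println("inner err:", err) // inner err: create failed
	}()

	// The outer err was never assigned, so the inner failure is invisible here.
	fmt.Println("outer err:", err) // outer err: <nil>

	func() {
		// A distinct name (as the test now uses with createErr) keeps the
		// two scopes unambiguous and satisfies shadow-checking linters.
		_, createErr := create()
		fmt.Println("createErr:", createErr) // createErr: create failed
	}()
}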
@@ -599,7 +605,7 @@ exit 0
 				Containers: []corev1.Container{
 					{
 						Name:    "load-generator",
-						Image:   "quay.io/curl/curl:latest",
+						Image:   "quay.io/curl/curl:8.11.1",
 						Command: []string{"/bin/sh", "-c"},
 						Args:    []string{script},
 						Resources: corev1.ResourceRequirements{
