@@ -133,13 +133,20 @@ jobs:
133133 HPA_STABILIZATION_SECONDS : ${{ github.event.inputs.hpa_stabilization_seconds || '30' }}
134134 SKIP_CLEANUP : ${{ github.event.inputs.skip_cleanup || 'false' }}
135135 # PR-specific namespaces for isolation between concurrent PR tests
136- # llm-d infrastructure (vLLM, gateway, EPP)
136+ # Primary llm-d namespace (Model A1 + A2)
137137 LLMD_NAMESPACE : llm-d-inference-scheduler-pr-${{ github.event.pull_request.number || github.run_id }}
138- # WVA controller and related resources
138+ # Secondary llm-d namespace (Model B)
139+ LLMD_NAMESPACE_B : llm-d-inference-scheduler-pr-${{ github.event.pull_request.number || github.run_id }}-b
140+ # WVA controller namespace (monitors all models)
139141 WVA_NAMESPACE : llm-d-autoscaler-pr-${{ github.event.pull_request.number || github.run_id }}
140- # Unique release names per run to avoid conflicts with other concurrent runs
142+ # Unique release names per run to avoid conflicts
141143 WVA_RELEASE_NAME : wva-e2e-${{ github.run_id }}
142- LLMD_RELEASE_SUFFIX : e2e-${{ github.run_id }}
144+ # Model A1: Primary deployment in LLMD_NAMESPACE
145+ MODEL_A1_RELEASE : model-a1-${{ github.run_id }}
146+ # Model A2: Secondary deployment in LLMD_NAMESPACE
147+ MODEL_A2_RELEASE : model-a2-${{ github.run_id }}
148+ # Model B: Deployment in LLMD_NAMESPACE_B
149+ MODEL_B_RELEASE : model-b-${{ github.run_id }}
143150 # Use the image built in the previous job
144151 WVA_IMAGE_TAG : ${{ needs.build-image.outputs.image_tag }}
145152 steps :
@@ -281,10 +288,17 @@ jobs:
281288 echo " HF token configuration: ✓"
282289 ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
283290
291+ - name : Create secondary namespace for Model B
292+ run : |
293+ echo "Creating secondary namespace for Model B..."
294+ kubectl create namespace "$LLMD_NAMESPACE_B" --dry-run=client -o yaml | kubectl apply -f -
295+ echo "Secondary namespace $LLMD_NAMESPACE_B created"
296+
284297 - name : Label namespaces for OpenShift monitoring
285298 run : |
286299 echo "Adding openshift.io/user-monitoring label to namespaces for Prometheus scraping..."
287300 kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
301+ kubectl label namespace "$LLMD_NAMESPACE_B" openshift.io/user-monitoring=true --overwrite
288302 kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
289303 echo "Namespace labels applied"
290304
@@ -293,51 +307,193 @@ jobs:
293307 echo "Waiting for WVA controller to be ready..."
294308 kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" || true
295309 kubectl get pods -n "$WVA_NAMESPACE"
296- echo "Waiting for llm-d deployment to be ready..."
310+ echo "Waiting for llm-d deployment (Model A1) to be ready..."
297311 kubectl get pods -n "$LLMD_NAMESPACE"
298312
299- - name : Patch vLLM deployment for e2e testing
313+ - name : Deploy Model A2 in primary namespace
314+ env :
315+ # Deploy second variant in same namespace, monitored by existing WVA controller
316+ LLMD_NS : ${{ env.LLMD_NAMESPACE }}
317+ WVA_NS : ${{ env.WVA_NAMESPACE }}
318+ run : |
319+ echo "Deploying Model A2 (second variant) in $LLMD_NAMESPACE..."
320+ echo " Release name: $MODEL_A2_RELEASE"
321+
322+ # Create a copy of Model A1's decode deployment for Model A2
323+ # This creates a second vLLM instance in the same namespace
324+ MODEL_A2_DEPLOYMENT="model-a2-decode"
325+ SOURCE_DEPLOYMENT="ms-inference-scheduling-llm-d-modelservice-decode"
326+
327+ echo "Creating Model A2 deployment from $SOURCE_DEPLOYMENT..."
328+ kubectl get deployment "$SOURCE_DEPLOYMENT" -n "$LLMD_NAMESPACE" -o yaml | \
329+ sed "s/$SOURCE_DEPLOYMENT/$MODEL_A2_DEPLOYMENT/g" | \
330+ sed 's/replicas: [0-9]*/replicas: 1/' | \
331+ kubectl apply -n "$LLMD_NAMESPACE" -f -
332+
333+ echo "Waiting for Model A2 deployment to be ready..."
334+ kubectl rollout status deployment/"$MODEL_A2_DEPLOYMENT" -n "$LLMD_NAMESPACE" --timeout=300s || true
335+
336+ # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model A2
337+ # controller.enabled=false since we're using the existing WVA controller
338+ helm upgrade -i "$MODEL_A2_RELEASE" ./charts/workload-variant-autoscaler \
339+ -n "$WVA_NAMESPACE" \
340+ --set controller.enabled=false \
341+ --set va.enabled=true \
342+ --set hpa.enabled=true \
343+ --set llmd.namespace="$LLMD_NAMESPACE" \
344+ --set llmd.modelName="$MODEL_A2_DEPLOYMENT" \
345+ --set llmd.modelID="$MODEL_ID" \
346+ --set va.accelerator="$ACCELERATOR_TYPE" \
347+ --set wva.baseName="model-a2" \
348+ --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring
349+
350+ echo "Model A2 WVA resources deployed"
351+ kubectl get deployment "$MODEL_A2_DEPLOYMENT" -n "$LLMD_NAMESPACE" || true
352+ kubectl get hpa -n "$LLMD_NAMESPACE" -l app.kubernetes.io/instance="$MODEL_A2_RELEASE" || true
353+ kubectl get variantautoscaling -n "$LLMD_NAMESPACE" -l app.kubernetes.io/instance="$MODEL_A2_RELEASE" || true
354+
355+ - name : Deploy Model B infrastructure in secondary namespace
356+ env :
357+ # HF_TOKEN is inherited from GITHUB_ENV
358+ ENVIRONMENT : openshift
359+ INSTALL_GATEWAY_CTRLPLANE : "false"
360+ E2E_TESTS_ENABLED : "true"
361+ NAMESPACE_SCOPED : "false"
362+ # Override namespaces for Model B stack
363+ LLMD_NS : ${{ env.LLMD_NAMESPACE_B }}
364+ WVA_NS : ${{ env.WVA_NAMESPACE }}
365+ # Skip WVA controller and prometheus (use existing)
366+ DEPLOY_WVA : "false"
367+ DEPLOY_PROMETHEUS : "false"
368+ DEPLOY_PROMETHEUS_ADAPTER : "false"
369+ DEPLOY_VA : "false"
370+ DEPLOY_HPA : "false"
371+ run : |
372+ echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
373+ echo " MODEL_ID: $MODEL_ID"
374+ echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
375+
376+ # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
377+ ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
378+
379+ echo "Waiting for Model B deployment to be ready..."
380+ kubectl wait --for=condition=available --timeout=300s deployment --all -n "$LLMD_NAMESPACE_B" || true
381+ kubectl get pods -n "$LLMD_NAMESPACE_B"
382+
383+ - name : Deploy Model B WVA resources
384+ env :
385+ LLMD_NS : ${{ env.LLMD_NAMESPACE_B }}
386+ WVA_NS : ${{ env.WVA_NAMESPACE }}
387+ run : |
388+ echo "Deploying Model B WVA resources..."
389+ echo " Release name: $MODEL_B_RELEASE"
390+
391+ # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model B
392+ # controller.enabled=false since we're using the existing WVA controller
393+ helm upgrade -i "$MODEL_B_RELEASE" ./charts/workload-variant-autoscaler \
394+ -n "$WVA_NAMESPACE" \
395+ --set controller.enabled=false \
396+ --set va.enabled=true \
397+ --set hpa.enabled=true \
398+ --set llmd.namespace="$LLMD_NAMESPACE_B" \
399+ --set llmd.modelName="ms-inference-scheduling-llm-d-modelservice-decode" \
400+ --set llmd.modelID="$MODEL_ID" \
401+ --set va.accelerator="$ACCELERATOR_TYPE" \
402+ --set wva.baseName="inference-scheduling" \
403+ --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring
404+
405+ echo "Model B WVA resources deployed"
406+ kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
407+ kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
408+
409+ - name : Verify multi-model deployment
410+ run : |
411+ echo "=== Multi-Model Deployment Status ==="
412+ echo ""
413+ echo "=== Model A1 (Primary, $LLMD_NAMESPACE) ==="
414+ kubectl get deployment -n "$LLMD_NAMESPACE" | grep -E "decode|NAME" || true
415+ kubectl get hpa -n "$LLMD_NAMESPACE" || true
416+ kubectl get variantautoscaling -n "$LLMD_NAMESPACE" || true
417+ echo ""
418+ echo "=== Model B ($LLMD_NAMESPACE_B) ==="
419+ kubectl get deployment -n "$LLMD_NAMESPACE_B" | grep -E "decode|NAME" || true
420+ kubectl get hpa -n "$LLMD_NAMESPACE_B" || true
421+ kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" || true
422+ echo ""
423+ echo "=== WVA Controller ($WVA_NAMESPACE) ==="
424+ kubectl get pods -n "$WVA_NAMESPACE"
425+
426+ - name : Patch vLLM deployments for e2e testing
300427 run : |
301- echo "Patching vLLM decode deployment to limit batch size for scaling test..."
428+ echo "Patching vLLM decode deployments to limit batch size for scaling test..."
302429 echo " MAX_NUM_SEQS: $MAX_NUM_SEQS"
303- DEPLOYMENT_NAME="ms-inference-scheduling-llm-d-modelservice-decode"
304-
305- # Find the vllm container index (container name is typically "vllm")
306- echo "Looking for vllm container in deployment..."
307- kubectl get deployment "$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" \
308- -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}'
309-
310- # Try to find container named "vllm", fall back to index 0
311- CONTAINER_INDEX="$(
312- kubectl get deployment "$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" \
313- -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}' \
314- | awk '$1 == "vllm" {print NR-1; exit}'
315- )"
316- if [ -z "$CONTAINER_INDEX" ]; then
317- echo "Container 'vllm' not found, using index 0"
318- CONTAINER_INDEX=0
319- fi
320- echo "Using container index: $CONTAINER_INDEX"
321430
322- # Add --max-num-seqs to force scaling under load
323- kubectl patch deployment "$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" --type=json -p="[
324- {\"op\": \"add\", \"path\": \"/spec/template/spec/containers/$CONTAINER_INDEX/args/-\", \"value\": \"--max-num-seqs=$MAX_NUM_SEQS\"}
325- ]"
326- echo "Waiting for patched deployment to roll out..."
327- kubectl rollout status deployment/"$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" --timeout=300s
431+ # Function to patch a deployment with --max-num-seqs
432+ patch_deployment() {
433+ local deployment_name=$1
434+ local namespace=$2
435+ echo ""
436+ echo "Patching deployment $deployment_name in $namespace..."
437+
438+ # Find the vllm container index
439+ CONTAINER_INDEX="$(
440+ kubectl get deployment "$deployment_name" -n "$namespace" \
441+ -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}' \
442+ | awk '$1 == "vllm" {print NR-1; exit}'
443+ )"
444+ if [ -z "$CONTAINER_INDEX" ]; then
445+ echo " Container 'vllm' not found, using index 0"
446+ CONTAINER_INDEX=0
447+ fi
448+ echo " Using container index: $CONTAINER_INDEX"
449+
450+ # Add --max-num-seqs to force scaling under load
451+ kubectl patch deployment "$deployment_name" -n "$namespace" --type=json -p="[
452+ {\"op\": \"add\", \"path\": \"/spec/template/spec/containers/$CONTAINER_INDEX/args/-\", \"value\": \"--max-num-seqs=$MAX_NUM_SEQS\"}
453+ ]"
454+ echo " Waiting for patched deployment to roll out..."
455+ kubectl rollout status deployment/"$deployment_name" -n "$namespace" --timeout=300s
456+ }
457+
458+ # Patch Model A1 deployment
459+ patch_deployment "ms-inference-scheduling-llm-d-modelservice-decode" "$LLMD_NAMESPACE"
460+
461+ # Patch Model A2 deployment
462+ patch_deployment "model-a2-decode" "$LLMD_NAMESPACE"
463+
464+ # Patch Model B deployment
465+ patch_deployment "ms-inference-scheduling-llm-d-modelservice-decode" "$LLMD_NAMESPACE_B"
328466
329- - name : Patch HPA for faster e2e testing
467+ echo ""
468+ echo "All vLLM deployments patched successfully"
469+
470+ - name : Patch HPAs for faster e2e testing
330471 run : |
331- echo "Patching HPA stabilization window for e2e testing..."
472+ echo "Patching HPA stabilization windows for e2e testing..."
332473 echo " HPA_STABILIZATION_SECONDS: $HPA_STABILIZATION_SECONDS"
333- # Find HPA by label selector (name includes release name)
334- HPA_NAME=$(kubectl get hpa -n "$LLMD_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}')
335- echo " HPA_NAME: $HPA_NAME"
336- kubectl patch hpa "$HPA_NAME" -n "$LLMD_NAMESPACE" --type=json -p='[
337- {"op": "replace", "path": "/spec/behavior/scaleUp/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'},
338- {"op": "replace", "path": "/spec/behavior/scaleDown/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'}
339- ]'
474+
475+ # Function to patch HPAs in a namespace
476+ patch_hpas_in_namespace() {
477+ local namespace=$1
478+ echo ""
479+ echo "Patching HPAs in namespace $namespace..."
480+ for HPA_NAME in $(kubectl get hpa -n "$namespace" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[*].metadata.name}'); do
481+ echo " Patching HPA: $HPA_NAME"
482+ kubectl patch hpa "$HPA_NAME" -n "$namespace" --type=json -p='[
483+ {"op": "replace", "path": "/spec/behavior/scaleUp/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'},
484+ {"op": "replace", "path": "/spec/behavior/scaleDown/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'}
485+ ]' || true
486+ done
487+ }
488+
489+ # Patch HPAs in both namespaces
490+ patch_hpas_in_namespace "$LLMD_NAMESPACE"
491+ patch_hpas_in_namespace "$LLMD_NAMESPACE_B"
492+
493+ echo ""
494+ echo "All HPAs:"
340495 kubectl get hpa -n "$LLMD_NAMESPACE"
496+ kubectl get hpa -n "$LLMD_NAMESPACE_B"
341497
342498 - name : Install Go dependencies
343499 run : go mod download
@@ -347,6 +503,10 @@ jobs:
347503 CONTROLLER_NAMESPACE : ${{ env.WVA_NAMESPACE }}
348504 MONITORING_NAMESPACE : openshift-user-workload-monitoring
349505 LLMD_NAMESPACE : ${{ env.LLMD_NAMESPACE }}
506+ # Multi-model testing: secondary namespace for Model B
507+ LLMD_NAMESPACE_B : ${{ env.LLMD_NAMESPACE_B }}
508+ # Model A2 deployment name
509+ DEPLOYMENT_A2 : model-a2-decode
350510 GATEWAY_NAME : infra-inference-scheduling-inference-gateway-istio
351511 DEPLOYMENT : ms-inference-scheduling-llm-d-modelservice-decode
352512 # Pass WVA_RELEASE_NAME so test can filter for current run's resources
@@ -355,7 +515,9 @@ jobs:
355515 echo "Running OpenShift E2E tests with configuration:"
356516 echo " CONTROLLER_NAMESPACE: $CONTROLLER_NAMESPACE"
357517 echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
518+ echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
358519 echo " DEPLOYMENT: $DEPLOYMENT"
520+ echo " DEPLOYMENT_A2: $DEPLOYMENT_A2"
359521 echo " GATEWAY_NAME: $GATEWAY_NAME"
360522 echo " MODEL_ID: $MODEL_ID"
361523 echo " REQUEST_RATE: $REQUEST_RATE"
@@ -368,25 +530,42 @@ jobs:
368530 run : |
369531 echo "Cleaning up ALL test infrastructure..."
370532 echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
533+ echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
371534 echo " WVA_NAMESPACE: $WVA_NAMESPACE"
372535 echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
536+ echo " MODEL_A2_RELEASE: $MODEL_A2_RELEASE"
537+ echo " MODEL_B_RELEASE: $MODEL_B_RELEASE"
373538
374- # Uninstall helm releases before deleting namespaces
539+ # Uninstall all WVA helm releases before deleting namespaces
375540 # This ensures proper cleanup of resources and removes helm tracking
376- echo "Uninstalling WVA helm release ..."
541+ echo "Uninstalling WVA helm releases ..."
377542 helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
543+ helm uninstall "$MODEL_A2_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
544+ helm uninstall "$MODEL_B_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
378545
379- echo "Uninstalling llm-d helm releases..."
380- # List and uninstall all helm releases in the llm-d namespace
546+ echo "Uninstalling llm-d helm releases in primary namespace..."
381547 for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
382548 echo " Uninstalling release: $release"
383549 helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
384550 done
385551
386- # Delete both PR-specific namespaces
552+ echo "Uninstalling llm-d helm releases in secondary namespace..."
553+ for release in $(helm list -n "$LLMD_NAMESPACE_B" -q 2>/dev/null); do
554+ echo " Uninstalling release: $release"
555+ helm uninstall "$release" -n "$LLMD_NAMESPACE_B" --ignore-not-found --wait --timeout 60s || true
556+ done
557+
558+ # Delete Model A2 deployment (not managed by helm)
559+ echo "Deleting Model A2 deployment..."
560+ kubectl delete deployment model-a2-decode -n "$LLMD_NAMESPACE" --ignore-not-found || true
561+
562+ # Delete all PR-specific namespaces
387563 echo "Deleting llm-d namespace $LLMD_NAMESPACE..."
388564 kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
389565
566+ echo "Deleting llm-d namespace $LLMD_NAMESPACE_B..."
567+ kubectl delete namespace "$LLMD_NAMESPACE_B" --ignore-not-found --timeout=120s || true
568+
390569 echo "Deleting WVA namespace $WVA_NAMESPACE..."
391570 kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
392571
0 commit comments