@@ -126,20 +126,27 @@ jobs:
     if: needs.gate.outputs.should_run == 'true'
     env:
       MODEL_ID: ${{ github.event.inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }}
-      ACCELERATOR_TYPE: ${{ github.event.inputs.accelerator_type || 'H100' }}
+      ACCELERATOR_TYPE: ${{ github.event.inputs.accelerator_type || 'A100' }}
       REQUEST_RATE: ${{ github.event.inputs.request_rate || '20' }}
       NUM_PROMPTS: ${{ github.event.inputs.num_prompts || '3000' }}
       MAX_NUM_SEQS: ${{ github.event.inputs.max_num_seqs || '1' }}
       HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '30' }}
       SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
       # PR-specific namespaces for isolation between concurrent PR tests
-      # llm-d infrastructure (vLLM, gateway, EPP)
+      # Primary llm-d namespace (Model A1 + A2)
       LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ github.event.pull_request.number || github.run_id }}
-      # WVA controller and related resources
+      # Secondary llm-d namespace (Model B)
+      LLMD_NAMESPACE_B: llm-d-inference-scheduler-pr-${{ github.event.pull_request.number || github.run_id }}-b
+      # WVA controller namespace (monitors all models)
       WVA_NAMESPACE: llm-d-autoscaler-pr-${{ github.event.pull_request.number || github.run_id }}
-      # Unique release names per run to avoid conflicts with other concurrent runs
+      # Unique release names per run to avoid conflicts
       WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }}
-      LLMD_RELEASE_SUFFIX: e2e-${{ github.run_id }}
+      # Model A1: Primary deployment in LLMD_NAMESPACE
+      MODEL_A1_RELEASE: model-a1-${{ github.run_id }}
+      # Model A2: Secondary deployment in LLMD_NAMESPACE
+      MODEL_A2_RELEASE: model-a2-${{ github.run_id }}
+      # Model B: Deployment in LLMD_NAMESPACE_B
+      MODEL_B_RELEASE: model-b-${{ github.run_id }}
       # Use the image built in the previous job
       WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
     steps:
@@ -281,10 +288,17 @@ jobs:
           echo " HF token configuration: ✓"
           ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
 
+      - name: Create secondary namespace for Model B
+        run: |
+          echo "Creating secondary namespace for Model B..."
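+          # The --dry-run=client -o yaml | kubectl apply idiom below makes namespace
+          # creation idempotent: re-runs update in place instead of failing on AlreadyExists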
+          kubectl create namespace "$LLMD_NAMESPACE_B" --dry-run=client -o yaml | kubectl apply -f -
+          echo "Secondary namespace $LLMD_NAMESPACE_B created"
+
       - name: Label namespaces for OpenShift monitoring
         run: |
           echo "Adding openshift.io/user-monitoring label to namespaces for Prometheus scraping..."
           kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
+          kubectl label namespace "$LLMD_NAMESPACE_B" openshift.io/user-monitoring=true --overwrite
           kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
           echo "Namespace labels applied"
@@ -293,51 +307,202 @@ jobs:
           echo "Waiting for WVA controller to be ready..."
           kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" || true
           kubectl get pods -n "$WVA_NAMESPACE"
-          echo "Waiting for llm-d deployment to be ready..."
+          echo "Waiting for llm-d deployment (Model A1) to be ready..."
           kubectl get pods -n "$LLMD_NAMESPACE"
 
-      - name: Patch vLLM deployment for e2e testing
+      - name: Deploy Model A2 in primary namespace
+        env:
+          # Deploy second variant in same namespace, monitored by existing WVA controller
+          LLMD_NS: ${{ env.LLMD_NAMESPACE }}
+          WVA_NS: ${{ env.WVA_NAMESPACE }}
+        run: |
+          echo "Deploying Model A2 (second variant) in $LLMD_NAMESPACE..."
+          echo " Release name: $MODEL_A2_RELEASE"
+
+          # Create a copy of Model A1's decode deployment for Model A2
+          # This creates a second vLLM instance in the same namespace
+          # IMPORTANT: Must update both the name AND the model label to avoid HPA conflicts
+          MODEL_A2_DEPLOYMENT="model-a2-decode"
+          SOURCE_DEPLOYMENT="ms-inference-scheduling-llm-d-modelservice-decode"
+          SOURCE_MODEL_LABEL="ms-inference-scheduling-llm-d-modelservice"
+          MODEL_A2_LABEL="model-a2"
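+          # Rationale for the distinct label: the HPAs select pods via the
+          # llm-d.ai/model label, so if A1 and A2 shared it, each autoscaler
+          # would count the other's replicas (the HPA conflict noted above)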
+
+          echo "Creating Model A2 deployment from $SOURCE_DEPLOYMENT..."
+          echo " Updating model label from $SOURCE_MODEL_LABEL to $MODEL_A2_LABEL"
+          # Update deployment name and model labels, but preserve serviceAccountName
+          # The sed for model label must not affect serviceAccount/serviceAccountName fields
+          kubectl get deployment "$SOURCE_DEPLOYMENT" -n "$LLMD_NAMESPACE" -o yaml | \
+            sed "s/$SOURCE_DEPLOYMENT/$MODEL_A2_DEPLOYMENT/g" | \
+            sed "s/llm-d.ai\/model: $SOURCE_MODEL_LABEL/llm-d.ai\/model: $MODEL_A2_LABEL/g" | \
+            sed 's/replicas: [0-9]*/replicas: 1/' | \
+            kubectl apply -n "$LLMD_NAMESPACE" -f -
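+          # (Caveat of this string-substitution clone: per the note above, it assumes
+          # the full deployment name never occurs in fields that must stay unchanged)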
+
+          echo "Waiting for Model A2 deployment to be ready..."
+          kubectl rollout status deployment/"$MODEL_A2_DEPLOYMENT" -n "$LLMD_NAMESPACE" --timeout=300s || true
+
+          # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model A2
+          # controller.enabled=false since we're using the existing WVA controller
+          # Note: llmd.modelName should be base name without -decode suffix (template appends it)
+          helm upgrade -i "$MODEL_A2_RELEASE" ./charts/workload-variant-autoscaler \
+            -n "$WVA_NAMESPACE" \
+            --set controller.enabled=false \
+            --set va.enabled=true \
+            --set hpa.enabled=true \
+            --set llmd.namespace="$LLMD_NAMESPACE" \
+            --set llmd.modelName="model-a2" \
+            --set llmd.modelID="$MODEL_ID" \
+            --set va.accelerator="$ACCELERATOR_TYPE" \
+            --set wva.baseName="model-a2" \
+            --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring
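+          # Values-file sketch equivalent to the --set flags above (nesting inferred
+          # mechanically from the flag paths; pass with -f instead of repeated --set):
+          #   controller: { enabled: false }
+          #   va: { enabled: true, accelerator: <ACCELERATOR_TYPE> }
+          #   hpa: { enabled: true }
+          #   llmd: { namespace: <ns>, modelName: model-a2, modelID: <MODEL_ID> }
+          #   wva: { baseName: model-a2, prometheus: { monitoringNamespace: openshift-user-workload-monitoring } }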
+
+          echo "Model A2 WVA resources deployed"
+          kubectl get deployment "$MODEL_A2_DEPLOYMENT" -n "$LLMD_NAMESPACE" || true
+          kubectl get hpa -n "$LLMD_NAMESPACE" -l app.kubernetes.io/instance="$MODEL_A2_RELEASE" || true
+          kubectl get variantautoscaling -n "$LLMD_NAMESPACE" -l app.kubernetes.io/instance="$MODEL_A2_RELEASE" || true
+
+      - name: Deploy Model B infrastructure in secondary namespace
+        env:
+          # HF_TOKEN is inherited from GITHUB_ENV
+          ENVIRONMENT: openshift
+          INSTALL_GATEWAY_CTRLPLANE: "false"
+          E2E_TESTS_ENABLED: "true"
+          NAMESPACE_SCOPED: "false"
+          # Override namespaces for the Model B stack
+          LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
+          WVA_NS: ${{ env.WVA_NAMESPACE }}
+          # Skip the WVA controller and Prometheus (use the existing ones)
+          DEPLOY_WVA: "false"
+          DEPLOY_PROMETHEUS: "false"
+          DEPLOY_PROMETHEUS_ADAPTER: "false"
+          DEPLOY_VA: "false"
+          DEPLOY_HPA: "false"
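+          # Assumption: deploy/install.sh reads these DEPLOY_* toggles from the
+          # environment and skips components the primary install already provides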
+        run: |
+          echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
+          echo " MODEL_ID: $MODEL_ID"
+          echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
+
+          # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
+          ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
+
+          echo "Waiting for Model B deployment to be ready..."
+          kubectl wait --for=condition=available --timeout=300s deployment --all -n "$LLMD_NAMESPACE_B" || true
+          kubectl get pods -n "$LLMD_NAMESPACE_B"
+
+      - name: Deploy Model B WVA resources
+        env:
+          LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
+          WVA_NS: ${{ env.WVA_NAMESPACE }}
+        run: |
+          echo "Deploying Model B WVA resources..."
+          echo " Release name: $MODEL_B_RELEASE"
+
+          # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model B
+          # controller.enabled=false since we're using the existing WVA controller
+          # Note: llmd.modelName should be base name without -decode suffix (template appends it)
+          helm upgrade -i "$MODEL_B_RELEASE" ./charts/workload-variant-autoscaler \
+            -n "$WVA_NAMESPACE" \
+            --set controller.enabled=false \
+            --set va.enabled=true \
+            --set hpa.enabled=true \
+            --set llmd.namespace="$LLMD_NAMESPACE_B" \
+            --set llmd.modelName="ms-inference-scheduling-llm-d-modelservice" \
+            --set llmd.modelID="$MODEL_ID" \
+            --set va.accelerator="$ACCELERATOR_TYPE" \
+            --set wva.baseName="inference-scheduling" \
+            --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring
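+          # Unlike Model A2, Model B keeps the default modelservice name: it lives in
+          # its own namespace, so no label/name rename is needed to avoid HPA clashes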
+
+          echo "Model B WVA resources deployed"
+          kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
+          kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
+
+      - name: Verify multi-model deployment
+        run: |
+          echo "=== Multi-Model Deployment Status ==="
+          echo ""
+          echo "=== Models A1 + A2 (Primary, $LLMD_NAMESPACE) ==="
+          kubectl get deployment -n "$LLMD_NAMESPACE" | grep -E "decode|NAME" || true
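+          # ("NAME" in the grep pattern keeps the kubectl header row in the output)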
+          kubectl get hpa -n "$LLMD_NAMESPACE" || true
+          kubectl get variantautoscaling -n "$LLMD_NAMESPACE" || true
+          echo ""
+          echo "=== Model B ($LLMD_NAMESPACE_B) ==="
+          kubectl get deployment -n "$LLMD_NAMESPACE_B" | grep -E "decode|NAME" || true
+          kubectl get hpa -n "$LLMD_NAMESPACE_B" || true
+          kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" || true
+          echo ""
+          echo "=== WVA Controller ($WVA_NAMESPACE) ==="
+          kubectl get pods -n "$WVA_NAMESPACE"
+
+      - name: Patch vLLM deployments for e2e testing
         run: |
-          echo "Patching vLLM decode deployment to limit batch size for scaling test..."
+          echo "Patching vLLM decode deployments to limit batch size for scaling test..."
           echo " MAX_NUM_SEQS: $MAX_NUM_SEQS"
-          DEPLOYMENT_NAME="ms-inference-scheduling-llm-d-modelservice-decode"
-
-          # Find the vllm container index (container name is typically "vllm")
-          echo "Looking for vllm container in deployment..."
-          kubectl get deployment "$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" \
-            -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}'
-
-          # Try to find container named "vllm", fall back to index 0
-          CONTAINER_INDEX="$(
-            kubectl get deployment "$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" \
-              -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}' \
-              | awk '$1 == "vllm" {print NR-1; exit}'
-          )"
-          if [ -z "$CONTAINER_INDEX" ]; then
-            echo "Container 'vllm' not found, using index 0"
-            CONTAINER_INDEX=0
-          fi
-          echo "Using container index: $CONTAINER_INDEX"
 
-          # Add --max-num-seqs to force scaling under load
-          kubectl patch deployment "$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" --type=json -p="[
-            {\"op\": \"add\", \"path\": \"/spec/template/spec/containers/$CONTAINER_INDEX/args/-\", \"value\": \"--max-num-seqs=$MAX_NUM_SEQS\"}
-          ]"
-          echo "Waiting for patched deployment to roll out..."
-          kubectl rollout status deployment/"$DEPLOYMENT_NAME" -n "$LLMD_NAMESPACE" --timeout=300s
+          # Function to patch a deployment with --max-num-seqs
+          patch_deployment() {
+            local deployment_name=$1
+            local namespace=$2
+            echo ""
+            echo "Patching deployment $deployment_name in $namespace..."
+
+            # Find the vllm container index
+            CONTAINER_INDEX="$(
+              kubectl get deployment "$deployment_name" -n "$namespace" \
+                -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}' \
+                | awk '$1 == "vllm" {print NR-1; exit}'
+            )"
+            if [ -z "$CONTAINER_INDEX" ]; then
+              echo " Container 'vllm' not found, using index 0"
+              CONTAINER_INDEX=0
+            fi
+            echo " Using container index: $CONTAINER_INDEX"
+
+            # Add --max-num-seqs to force scaling under load
+            kubectl patch deployment "$deployment_name" -n "$namespace" --type=json -p="[
+              {\"op\": \"add\", \"path\": \"/spec/template/spec/containers/$CONTAINER_INDEX/args/-\", \"value\": \"--max-num-seqs=$MAX_NUM_SEQS\"}
+            ]"
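+            # ("/args/-" is JSON-Patch append syntax: the flag is added after any
+            # existing vLLM arguments rather than replacing them)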
+            echo " Waiting for patched deployment to roll out..."
+            kubectl rollout status deployment/"$deployment_name" -n "$namespace" --timeout=300s
+          }
+
+          # Patch Model A1 deployment
+          patch_deployment "ms-inference-scheduling-llm-d-modelservice-decode" "$LLMD_NAMESPACE"
+
+          # Patch Model A2 deployment
+          patch_deployment "model-a2-decode" "$LLMD_NAMESPACE"
+
+          # Patch Model B deployment
+          patch_deployment "ms-inference-scheduling-llm-d-modelservice-decode" "$LLMD_NAMESPACE_B"
 
-      - name: Patch HPA for faster e2e testing
+          echo ""
+          echo "All vLLM deployments patched successfully"
+
+      - name: Patch HPAs for faster e2e testing
         run: |
-          echo "Patching HPA stabilization window for e2e testing..."
+          echo "Patching HPA stabilization windows for e2e testing..."
           echo " HPA_STABILIZATION_SECONDS: $HPA_STABILIZATION_SECONDS"
-          # Find HPA by label selector (name includes release name)
-          HPA_NAME=$(kubectl get hpa -n "$LLMD_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}')
-          echo " HPA_NAME: $HPA_NAME"
-          kubectl patch hpa "$HPA_NAME" -n "$LLMD_NAMESPACE" --type=json -p='[
-            {"op": "replace", "path": "/spec/behavior/scaleUp/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'},
-            {"op": "replace", "path": "/spec/behavior/scaleDown/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'}
-          ]'
+
+          # Function to patch HPAs in a namespace
+          patch_hpas_in_namespace() {
+            local namespace=$1
+            echo ""
+            echo "Patching HPAs in namespace $namespace..."
+            for HPA_NAME in $(kubectl get hpa -n "$namespace" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[*].metadata.name}'); do
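+              # jsonpath {.items[*].metadata.name} emits space-separated names, so the
+              # loop handles zero, one, or many matching HPAs per namespace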
+              echo " Patching HPA: $HPA_NAME"
+              kubectl patch hpa "$HPA_NAME" -n "$namespace" --type=json -p='[
+                {"op": "replace", "path": "/spec/behavior/scaleUp/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'},
+                {"op": "replace", "path": "/spec/behavior/scaleDown/stabilizationWindowSeconds", "value": '"$HPA_STABILIZATION_SECONDS"'}
+              ]' || true
+            done
+          }
+
+          # Patch HPAs in both namespaces
+          patch_hpas_in_namespace "$LLMD_NAMESPACE"
+          patch_hpas_in_namespace "$LLMD_NAMESPACE_B"
+
+          echo ""
+          echo "All HPAs:"
           kubectl get hpa -n "$LLMD_NAMESPACE"
+          kubectl get hpa -n "$LLMD_NAMESPACE_B"
 
       - name: Install Go dependencies
         run: go mod download
@@ -347,6 +512,10 @@ jobs:
           CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
           MONITORING_NAMESPACE: openshift-user-workload-monitoring
           LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
+          # Multi-model testing: secondary namespace for Model B
+          LLMD_NAMESPACE_B: ${{ env.LLMD_NAMESPACE_B }}
+          # Model A2 deployment name
+          DEPLOYMENT_A2: model-a2-decode
           GATEWAY_NAME: infra-inference-scheduling-inference-gateway-istio
           DEPLOYMENT: ms-inference-scheduling-llm-d-modelservice-decode
           # Pass WVA_RELEASE_NAME so test can filter for current run's resources
@@ -355,7 +524,9 @@ jobs:
           echo "Running OpenShift E2E tests with configuration:"
           echo " CONTROLLER_NAMESPACE: $CONTROLLER_NAMESPACE"
           echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
+          echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
           echo " DEPLOYMENT: $DEPLOYMENT"
+          echo " DEPLOYMENT_A2: $DEPLOYMENT_A2"
           echo " GATEWAY_NAME: $GATEWAY_NAME"
           echo " MODEL_ID: $MODEL_ID"
           echo " REQUEST_RATE: $REQUEST_RATE"
@@ -368,25 +539,42 @@ jobs:
         run: |
           echo "Cleaning up ALL test infrastructure..."
           echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
+          echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
           echo " WVA_NAMESPACE: $WVA_NAMESPACE"
           echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
+          echo " MODEL_A2_RELEASE: $MODEL_A2_RELEASE"
+          echo " MODEL_B_RELEASE: $MODEL_B_RELEASE"
 
-          # Uninstall helm releases before deleting namespaces
+          # Uninstall all WVA helm releases before deleting namespaces
           # This ensures proper cleanup of resources and removes helm tracking
-          echo "Uninstalling WVA helm release..."
+          echo "Uninstalling WVA helm releases..."
           helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
+          helm uninstall "$MODEL_A2_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
+          helm uninstall "$MODEL_B_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
 
-          echo "Uninstalling llm-d helm releases..."
-          # List and uninstall all helm releases in the llm-d namespace
+          echo "Uninstalling llm-d helm releases in primary namespace..."
           for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
             echo " Uninstalling release: $release"
             helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
           done
 
-          # Delete both PR-specific namespaces
+          echo "Uninstalling llm-d helm releases in secondary namespace..."
+          for release in $(helm list -n "$LLMD_NAMESPACE_B" -q 2>/dev/null); do
+            echo " Uninstalling release: $release"
+            helm uninstall "$release" -n "$LLMD_NAMESPACE_B" --ignore-not-found --wait --timeout 60s || true
+          done
+
+          # Delete Model A2 deployment (not managed by helm)
+          echo "Deleting Model A2 deployment..."
+          kubectl delete deployment model-a2-decode -n "$LLMD_NAMESPACE" --ignore-not-found || true
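+          # (Deleting the namespace below would remove this deployment too; the
+          # explicit delete keeps cleanup deterministic if namespace teardown stalls)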
+
+          # Delete all PR-specific namespaces
           echo "Deleting llm-d namespace $LLMD_NAMESPACE..."
           kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
 
+          echo "Deleting llm-d namespace $LLMD_NAMESPACE_B..."
+          kubectl delete namespace "$LLMD_NAMESPACE_B" --ignore-not-found --timeout=120s || true
+
           echo "Deleting WVA namespace $WVA_NAMESPACE..."
           kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
 