From e49111edc7f1e6eb7869ac64ae6122076f24784f Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Tue, 30 Sep 2025 20:42:20 -0400 Subject: [PATCH 01/10] Make the check-container-tool target more robust Signed-off-by: Hannah DeFazio --- Makefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6c742ff5..d005172e 100644 --- a/Makefile +++ b/Makefile @@ -293,9 +293,16 @@ check-envsubst: .PHONY: check-container-tool check-container-tool: - @command -v $(CONTAINER_TOOL) >/dev/null 2>&1 || { \ - echo "❌ $(CONTAINER_TOOL) is not installed."; \ - echo "🔧 Try: sudo apt install $(CONTAINER_TOOL) OR brew install $(CONTAINER_TOOL)"; exit 1; } + @if [ -z "$(CONTAINER_TOOL)" ]; then \ + echo "❌ Error: No container tool detected. Please install docker or podman."; \ + exit 1; \ + elif ! command -v $(CONTAINER_TOOL) >/dev/null 2>&1; then \ + echo "❌ Error: '$(CONTAINER_TOOL)' is not installed or not in your PATH."; \ + echo "🔧 Try: sudo apt install $(CONTAINER_TOOL) OR brew install $(CONTAINER_TOOL)"; \ + exit 1; \ + else \ + echo "✅ Container tool '$(CONTAINER_TOOL)' found."; \ + fi .PHONY: check-kubectl check-kubectl: From a7bcad2bb18af581f3188532926e91f4a3ff0022 Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Tue, 30 Sep 2025 20:52:33 -0400 Subject: [PATCH 02/10] Use the CONTAINER_TOOL env var for running the e2e test script Signed-off-by: Hannah DeFazio --- Makefile | 1 + test/scripts/run_e2e.sh | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index d005172e..41fd228f 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-unknown-linux-musl endif CONTAINER_TOOL := $(shell { command -v docker >/dev/null 2>&1 && echo docker; } || { command -v podman >/dev/null 2>&1 && echo podman; } || echo "") +export CONTAINER_TOOL BUILDER := $(shell command -v buildah >/dev/null 2>&1 && echo buildah || echo $(CONTAINER_TOOL)) PLATFORMS ?= linux/amd64 # linux/arm64 # linux/s390x,linux/ppc64le diff --git a/test/scripts/run_e2e.sh b/test/scripts/run_e2e.sh index add66dc3..fe13508a 100755 --- a/test/scripts/run_e2e.sh +++ b/test/scripts/run_e2e.sh @@ -1,5 +1,9 @@ #!/bin/bash +# Use the CONTAINER_TOOL from the environment, or default to docker if it's not set. +CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" +echo "Using container tool: ${CONTAINER_TOOL}" + # Set a default EPP_TAG if not provided export EPP_TAG="${EPP_TAG:-dev}" @@ -9,27 +13,27 @@ export VLLM_SIMULATOR_TAG="${VLLM_SIMULATOR_TAG:-v0.4.0}" # Set the default routing side car image tag export ROUTING_SIDECAR_TAG="${ROUTING_SIDECAR_TAG:-v0.2.0}" -SIMTAG=$(docker images | grep ghcr.io/llm-d/llm-d-inference-sim | awk '{print $2}' | grep ${VLLM_SIMULATOR_TAG}) +SIMTAG=$(${CONTAINER_TOOL} images | grep ghcr.io/llm-d/llm-d-inference-sim | awk '{print $2}' | grep ${VLLM_SIMULATOR_TAG}) if [[ "${SIMTAG}" != "${VLLM_SIMULATOR_TAG}" ]]; then - docker pull ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG} + ${CONTAINER_TOOL} pull ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG} if [[ $? != 0 ]]; then echo "Failed to pull ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG}" exit 1 fi fi -EPPTAG=$(docker images | grep ghcr.io/llm-d/llm-d-inference-scheduler | awk '{print $2}' | grep ${EPP_TAG}) +EPPTAG=$(${CONTAINER_TOOL} images | grep ghcr.io/llm-d/llm-d-inference-scheduler | awk '{print $2}' | grep ${EPP_TAG}) if [[ "${EPPTAG}" != "${EPP_TAG}" ]]; then - docker pull ghcr.io/llm-d/llm-d-inference-scheduler:${EPP_TAG} + ${CONTAINER_TOOL} pull ghcr.io/llm-d/llm-d-inference-scheduler:${EPP_TAG} if [[ $? != 0 ]]; then echo "Failed to pull ghcr.io/llm-d/llm-d-inference-scheduler:${EPP_TAG}" exit 1 fi fi -SIDECARTAG=$(docker images | grep ghcr.io/llm-d/llm-d-routing-sidecar | awk '{print $2}' | grep ${ROUTING_SIDECAR_TAG}) +SIDECARTAG=$(${CONTAINER_TOOL} images | grep ghcr.io/llm-d/llm-d-routing-sidecar | awk '{print $2}' | grep ${ROUTING_SIDECAR_TAG}) if [[ "${SIDECARTAG}" != "${ROUTING_SIDECAR_TAG}" ]]; then - docker pull ghcr.io/llm-d/llm-d-routing-sidecar:${ROUTING_SIDECAR_TAG} + ${CONTAINER_TOOL} pull ghcr.io/llm-d/llm-d-routing-sidecar:${ROUTING_SIDECAR_TAG} if [[ $? != 0 ]]; then echo "Failed to pull ghcr.io/llm-d/llm-d-routing-sidecar:${ROUTING_SIDECAR_TAG}" exit 1 From 8f620c5a332b6c8c89905708580cb3d270f24c17 Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Mon, 6 Oct 2025 14:41:54 -0400 Subject: [PATCH 03/10] Set the full docker image paths via env vars --- Makefile | 47 +++++++++++-------- .../inference-gateway/deployments.yaml | 2 +- .../inference-gateway/kustomization.yaml | 4 -- .../components/vllm-sim-pd/deployments.yaml | 6 +-- .../components/vllm-sim-pd/kustomization.yaml | 6 --- deploy/components/vllm-sim/deployments.yaml | 2 +- deploy/components/vllm-sim/kustomization.yaml | 4 -- .../patch-deployments.yaml | 2 +- .../common/patch-statefulset.yaml | 2 +- .../openshift-base/kustomization.yaml | 6 --- scripts/kind-dev-env.sh | 27 +++++------ scripts/kubernetes-dev-env.sh | 8 +--- scripts/pull_images.sh | 36 ++++++++++++++ test/e2e/e2e_suite_test.go | 15 +++--- test/e2e/e2e_test.go | 20 ++++---- test/e2e/yaml/deployments.yaml | 2 +- test/e2e/yaml/vllm-sim-pd.yaml | 6 +-- test/e2e/yaml/vllm-sim.yaml | 2 +- test/scripts/run_e2e.sh | 40 ---------------- 19 files changed, 104 insertions(+), 133 deletions(-) create mode 100644 scripts/pull_images.sh diff --git a/Makefile b/Makefile index 41fd228f..e6c7832a 100644 --- a/Makefile +++ b/Makefile @@ -8,9 +8,10 @@ TARGETOS ?= $(shell go env GOOS) TARGETARCH ?= $(shell go env GOARCH) PROJECT_NAME ?= llm-d-inference-scheduler IMAGE_REGISTRY ?= ghcr.io/llm-d -IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME) +EPP_IMG_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME) EPP_TAG ?= dev -IMG = $(IMAGE_TAG_BASE):$(EPP_TAG) +EPP_IMAGE = $(EPP_IMG_TAG_BASE):$(EPP_TAG) +export EPP_IMAGE NAMESPACE ?= hc4ai-operator # Map go arch to typos arch @@ -95,7 +96,7 @@ test-integration: download-tokenizer install-dependencies ## Run integration tes go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/ .PHONY: test-e2e -test-e2e: image-build ## Run end-to-end tests against a new kind cluster +test-e2e: image-build image-pull ## Run end-to-end tests against a new kind cluster @printf "\033[33;1m==== Running End to End Tests ====\033[0m\n" ./test/scripts/run_e2e.sh @@ -120,20 +121,26 @@ build: check-go install-dependencies download-tokenizer ## Build the project ##@ Container Build/Push .PHONY: image-build -image-build: check-container-tool ## Build Docker image ## Build Docker image using $(CONTAINER_TOOL) - @printf "\033[33;1m==== Building Docker image $(IMG) ====\033[0m\n" +image-build: check-container-tool ## Build Docker image using $(CONTAINER_TOOL) + @printf "\033[33;1m==== Building Docker image $(EPP_IMAGE) ====\033[0m\n" $(CONTAINER_TOOL) build \ --platform linux/$(TARGETARCH) \ --build-arg TARGETOS=linux \ --build-arg TARGETARCH=$(TARGETARCH) \ --build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \ --build-arg BUILD_REF=${BUILD_REF} \ - -t $(IMG) . + -t $(EPP_IMAGE) . .PHONY: image-push -image-push: check-container-tool ## Push Docker image $(IMG) to registry - @printf "\033[33;1m==== Pushing Docker image $(IMG) ====\033[0m\n" - $(CONTAINER_TOOL) push $(IMG) +image-push: check-container-tool ## Push Docker image $(EPP_IMAGE) to registry + @printf "\033[33;1m==== Pushing Docker image $(EPP_IMAGE) ====\033[0m\n" + $(CONTAINER_TOOL) push $(EPP_IMAGE) + +.PHONY image-pull +image-pull: check-container-tool ## Pull all related images using $(CONTAINER_TOOL) + @printf "\033[33;1m==== Pulling Docker images ====\033[0m\n" + ./scripts/pull_images.sh + ##@ Install/Uninstall Targets @@ -149,7 +156,7 @@ uninstall: uninstall-docker ## Default uninstall using Docker .PHONY: install-docker install-docker: check-container-tool ## Install app using $(CONTAINER_TOOL) @echo "Starting container with $(CONTAINER_TOOL)..." - $(CONTAINER_TOOL) run -d --name $(PROJECT_NAME)-container $(IMG) + $(CONTAINER_TOOL) run -d --name $(PROJECT_NAME)-container $(EPP_IMAGE) @echo "$(CONTAINER_TOOL) installation complete." @echo "To use $(PROJECT_NAME), run:" @echo "alias $(PROJECT_NAME)='$(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)'" @@ -194,12 +201,12 @@ uninstall-k8s: check-kubectl check-kustomize check-envsubst ## Uninstall from Ku .PHONY: install-openshift install-openshift: check-kubectl check-kustomize check-envsubst ## Install on OpenShift - @echo $$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION + @echo $$PROJECT_NAME $$NAMESPACE $$EPP_IMAGE @echo "Creating namespace $(NAMESPACE)..." kubectl create namespace $(NAMESPACE) 2>/dev/null || true @echo "Deploying common resources from deploy/ ..." # Build and substitute the base manifests from deploy, then apply them - kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl apply -n $(NAMESPACE) -f - + kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$EPP_IMAGE' | kubectl apply -n $(NAMESPACE) -f - @echo "Waiting for pod to become ready..." sleep 5 @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -n $(NAMESPACE) -o jsonpath='{.items[0].metadata.name}'); \ @@ -210,9 +217,9 @@ install-openshift: check-kubectl check-kustomize check-envsubst ## Install on Op .PHONY: uninstall-openshift uninstall-openshift: check-kubectl check-kustomize check-envsubst ## Uninstall from OpenShift @echo "Removing resources from OpenShift..." - kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl delete --force -f - || true + kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$EPP_IMAGE' | kubectl delete --force -f - || true # @if kubectl api-resources --api-group=route.openshift.io | grep -q Route; then \ - # envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' < deploy/openshift/route.yaml | kubectl delete --force -f - || true; \ + # envsubst '$$PROJECT_NAME $$NAMESPACE $$EPP_IMAGE' < deploy/openshift/route.yaml | kubectl delete --force -f - || true; \ # fi @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -n $(NAMESPACE) -o jsonpath='{.items[0].metadata.name}'); \ echo "Deleting pod: $$POD"; \ @@ -224,19 +231,20 @@ uninstall-openshift: check-kubectl check-kustomize check-envsubst ## Uninstall f .PHONY: install-rbac install-rbac: check-kubectl check-kustomize check-envsubst ## Install RBAC @echo "Applying RBAC configuration from deploy/rbac..." - kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl apply -f - + kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$EPP_IMAGE' | kubectl apply -f - .PHONY: uninstall-rbac uninstall-rbac: check-kubectl check-kustomize check-envsubst ## Uninstall RBAC @echo "Removing RBAC configuration from deploy/rbac..." - kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl delete -f - || true + kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$EPP_IMAGE' | kubectl delete -f - || true ##@ Environment .PHONY: env env: ## Print environment variables - @echo "IMAGE_TAG_BASE=$(IMAGE_TAG_BASE)" - @echo "IMG=$(IMG)" + @echo "EPP_IMAGE=$(EPP_IMAGE)" @echo "CONTAINER_TOOL=$(CONTAINER_TOOL)" + @echo "NAMESPACE=${NAMESPACE}" + @echo "GIT_COMMIT_SHA=${GIT_COMMIT_SHA}" .PHONY: check-typos check-typos: $(TYPOS) ## Check for spelling errors using typos (exits with error if found) @@ -356,8 +364,7 @@ env-dev-kind: ## Run under kind ($(KIND_CLUSTER_NAME)) $(MAKE) image-build && \ CLUSTER_NAME=$(KIND_CLUSTER_NAME) \ GATEWAY_HOST_PORT=$(KIND_GATEWAY_HOST_PORT) \ - IMAGE_REGISTRY=$(IMAGE_REGISTRY) \ - EPP_TAG=$(EPP_TAG) \ + EPP_IMAGE=$(EPP_IMAGE) \ ./scripts/kind-dev-env.sh; \ fi diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index 1b86e8ed..57ab61c9 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -18,7 +18,7 @@ spec: terminationGracePeriodSeconds: 130 containers: - name: epp - image: ghcr.io/llm-d/llm-d-inference-scheduler:latest + image: ${EPP_IMAGE} imagePullPolicy: IfNotPresent args: - --pool-name diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml index 68fd981b..7f6dec8f 100644 --- a/deploy/components/inference-gateway/kustomization.yaml +++ b/deploy/components/inference-gateway/kustomization.yaml @@ -18,7 +18,3 @@ resources: - deployments.yaml - gateways.yaml - httproutes.yaml - -images: -- name: ghcr.io/llm-d/llm-d-inference-scheduler - newTag: ${EPP_TAG} diff --git a/deploy/components/vllm-sim-pd/deployments.yaml b/deploy/components/vllm-sim-pd/deployments.yaml index 5a644b85..21368df3 100644 --- a/deploy/components/vllm-sim-pd/deployments.yaml +++ b/deploy/components/vllm-sim-pd/deployments.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: vllm - image: ghcr.io/llm-d/llm-d-inference-sim:latest + image: ${VLLM_SIMULATOR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8000" @@ -49,7 +49,7 @@ spec: spec: initContainers: - name: routing-sidecar - image: ghcr.io/llm-d/llm-d-routing-sidecar:latest + image: ${ROUTING_SIDECAR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8000" @@ -61,7 +61,7 @@ spec: restartPolicy: Always containers: - name: vllm - image: ghcr.io/llm-d/llm-d-inference-sim:latest + image: ${VLLM_SIMULATOR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8200" diff --git a/deploy/components/vllm-sim-pd/kustomization.yaml b/deploy/components/vllm-sim-pd/kustomization.yaml index 14d7f1b3..40ac17d3 100644 --- a/deploy/components/vllm-sim-pd/kustomization.yaml +++ b/deploy/components/vllm-sim-pd/kustomization.yaml @@ -10,9 +10,3 @@ kind: Kustomization resources: - deployments.yaml - -images: -- name: ghcr.io/llm-d/llm-d-inference-sim - newTag: ${VLLM_SIMULATOR_TAG} -- name: ghcr.io/llm-d/llm-d-routing-sidecar - newTag: ${ROUTING_SIDECAR_TAG} diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml index e6d8cae1..d116d53b 100644 --- a/deploy/components/vllm-sim/deployments.yaml +++ b/deploy/components/vllm-sim/deployments.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: vllm - image: ghcr.io/llm-d/llm-d-inference-sim:latest + image: ${VLLM_SIMULATOR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8000" diff --git a/deploy/components/vllm-sim/kustomization.yaml b/deploy/components/vllm-sim/kustomization.yaml index 9748f239..40ac17d3 100644 --- a/deploy/components/vllm-sim/kustomization.yaml +++ b/deploy/components/vllm-sim/kustomization.yaml @@ -10,7 +10,3 @@ kind: Kustomization resources: - deployments.yaml - -images: -- name: ghcr.io/llm-d/llm-d-inference-sim - newTag: ${VLLM_SIMULATOR_TAG} diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index 6b7c4430..4bddd5b7 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -7,7 +7,7 @@ spec: spec: containers: - name: epp - image: ${EPP_IMAGE}:${EPP_TAG} + image: ${EPP_IMAGE} imagePullPolicy: Always args: - --pool-name diff --git a/deploy/environments/openshift-base/common/patch-statefulset.yaml b/deploy/environments/openshift-base/common/patch-statefulset.yaml index 5b7676ff..c3a8a39c 100644 --- a/deploy/environments/openshift-base/common/patch-statefulset.yaml +++ b/deploy/environments/openshift-base/common/patch-statefulset.yaml @@ -16,5 +16,5 @@ spec: serviceAccountName: operator-controller-manager containers: - name: cmd - image: ${IMAGE_TAG_BASE}:${VERSION} + image: ${EPP_IMAGE} imagePullPolicy: Always diff --git a/deploy/environments/openshift-base/kustomization.yaml b/deploy/environments/openshift-base/kustomization.yaml index c690de16..9fff1639 100644 --- a/deploy/environments/openshift-base/kustomization.yaml +++ b/deploy/environments/openshift-base/kustomization.yaml @@ -22,12 +22,6 @@ configMapGenerator: disableNameSuffixHash: true # Include patches to update the Service, StatefulSet, Route, and RBAC resources. - -# Define the image to be updated. -# images: -# - name: ghcr.io/llm-d/placeholder -# newName: ghcr.io/llm-d/${IMAGE_TAG_BASE} -# newTag: ${VERSION} patches: - path: common/patch-service.yaml - path: common/patch-statefulset.yaml diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index 64770615..f8baf041 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -25,14 +25,11 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Set a default VLLM_SIMULATOR_IMAGE if not provided : "${VLLM_SIMULATOR_IMAGE:=llm-d-inference-sim}" -# Set a default VLLM_SIMULATOR_TAG if not provided -export VLLM_SIMULATOR_TAG="${VLLM_SIMULATOR_TAG:-latest}" +# Set a default VLLM_SIMULATOR_IMAGE if not provided +export VLLM_SIMULATOR_IMAGE="${VLLM_SIMULATOR_IMAGE:-ghcr.io/llm-d/llm-d-inference-sim:v0.4.0}" # Set a default EPP_IMAGE if not provided -: "${EPP_IMAGE:=llm-d-inference-scheduler}" - -# Set a default EPP_TAG if not provided -export EPP_TAG="${EPP_TAG:-dev}" +export EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-inference-scheduler:dev}" # Set the model name to deploy export MODEL_NAME="${MODEL_NAME:-food-review}" @@ -46,8 +43,8 @@ export MODEL_NAME_SAFE=$(echo "${MODEL_ID}" | tr '[:upper:]' '[:lower:]' | tr ' # Set the endpoint-picker to deploy export EPP_NAME="${EPP_NAME:-${MODEL_NAME_SAFE}-endpoint-picker}" -# Set the default routing side car image tag -export ROUTING_SIDECAR_TAG="${ROUTING_SIDECAR_TAG:-0.0.6}" +# Set the default routing side car image +export ROUTING_SIDECAR_IMAGE="${ROUTING_SIDECAR_IMAGE:-ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0}" # Set the inference pool name for the deployment export POOL_NAME="${POOL_NAME:-${MODEL_NAME_SAFE}-inference-pool}" @@ -152,19 +149,19 @@ kubectl --context ${KUBE_CONTEXT} -n local-path-storage wait --for=condition=Rea # Load the vllm simulator image into the cluster if [ "${CONTAINER_RUNTIME}" == "podman" ]; then - podman save ${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin + podman save ${VLLM_SIMULATOR_IMAGE} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin else - if docker image inspect "${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG}" > /dev/null 2>&1; then + if docker image inspect ${VLLM_SIMULATOR_IMAGE} > /dev/null 2>&1; then echo "INFO: Loading image into KIND cluster..." - kind --name ${CLUSTER_NAME} load docker-image ${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG} + kind --name ${CLUSTER_NAME} load docker-image ${VLLM_SIMULATOR_IMAGE} fi fi # Load the ext_proc endpoint-picker image into the cluster if [ "${CONTAINER_RUNTIME}" == "podman" ]; then - podman save ${IMAGE_REGISTRY}/${EPP_IMAGE}:${EPP_TAG} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin + podman save ${EPP_IMAGE} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin else - kind --name ${CLUSTER_NAME} load docker-image ${IMAGE_REGISTRY}/${EPP_IMAGE}:${EPP_TAG} + kind --name ${CLUSTER_NAME} load docker-image ${EPP_IMAGE} fi # ------------------------------------------------------------------------------ # CRD Deployment (Gateway API + GIE) @@ -194,8 +191,8 @@ kubectl --context ${KUBE_CONTEXT} delete configmap epp-config --ignore-not-found kubectl --context ${KUBE_CONTEXT} create configmap epp-config --from-file=epp-config.yaml=${EPP_CONFIG} kustomize build --enable-helm ${KUSTOMIZE_DIR} \ - | envsubst '${POOL_NAME} ${MODEL_NAME} ${MODEL_NAME_SAFE} ${EPP_NAME} ${EPP_TAG} ${VLLM_SIMULATOR_TAG} \ - ${PD_ENABLED} ${KV_CACHE_ENABLED} ${ROUTING_SIDECAR_TAG} \ + | envsubst '${POOL_NAME} ${MODEL_NAME} ${MODEL_NAME_SAFE} ${EPP_NAME} ${EPP_IMAGE} ${VLLM_SIMULATOR_IMAGE} \ + ${PD_ENABLED} ${KV_CACHE_ENABLED} ${ROUTING_SIDECAR_IMAGE} \ ${VLLM_REPLICA_COUNT} ${VLLM_REPLICA_COUNT_P} ${VLLM_REPLICA_COUNT_D}' \ | kubectl --context ${KUBE_CONTEXT} apply -f - diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 71f355d2..27cd1bc7 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -31,9 +31,6 @@ if [[ ! -f "$VLLM_CHART_DIR/Chart.yaml" ]]; then exit 1 fi -# Default image registry for pulling deployment images -export IMAGE_REGISTRY="${IMAGE_REGISTRY:-ghcr.io/llm-d}" - # ----------------------------------------------------------------------------- # Model Configuration # ----------------------------------------------------------------------------- @@ -72,10 +69,7 @@ export POOL_NAME="${POOL_NAME:-${MODEL_NAME_SAFE}-inference-pool}" export EPP_NAME="${EPP_NAME:-${MODEL_NAME_SAFE}-endpoint-picker}" # EPP container image name -export EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler}" - -# EPP image tag -export EPP_TAG="${EPP_TAG:-v0.1.0}" +export EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-inference-scheduler:dev}" # Whether P/D mode is enabled for this deployment export PD_ENABLED="\"${PD_ENABLED:-false}\"" diff --git a/scripts/pull_images.sh b/scripts/pull_images.sh new file mode 100644 index 00000000..61e8fbee --- /dev/null +++ b/scripts/pull_images.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Use the CONTAINER_TOOL from the environment, or default to docker if it's not set. +CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" +echo "Using container tool: ${CONTAINER_TOOL}" + +export EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-inference-scheduler:dev}" +export VLLM_SIMULATOR_IMAGE="${VLLM_SIMULATOR_IMAGE:-ghcr.io/llm-d/llm-d-inference-sim:v0.4.0}" +export ROUTING_SIDECAR_IMAGE="${ROUTING_SIDECAR_IMAGE:-ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0}" + +# --- Print Final Images and Pull Dependencies --- +echo "--- Using the following images for the E2E test ---" +echo "Scheduler Image: ${EPP_IMAGE}" +echo "Simulator Image: ${VLLM_SIMULATOR_IMAGE}" +echo "Sidecar Image: ${ROUTING_SIDECAR_IMAGE}" +echo "----------------------------------------------------" + +echo "Pulling dependencies..." +${CONTAINER_TOOL} pull ${EPP_IMAGE} +if [[ $? != 0 ]]; then + echo "Failed to pull ${EPP_IMAGE}" + exit 1 +fi + +${CONTAINER_TOOL} pull ${VLLM_SIMULATOR_IMAGE} +if [[ $? != 0 ]]; then + echo "Failed to pull ${VLLM_SIMULATOR_IMAGE}" + exit 1 +fi + +${CONTAINER_TOOL} pull ${ROUTING_SIDECAR_IMAGE} +if [[ $? != 0 ]]; then + echo "Failed to pull ${ROUTING_SIDECAR_IMAGE}" + exit 1 +fi +echo "Successfully pulled dependencies" diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index a69246b2..8ccff2d9 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -60,9 +60,9 @@ var ( port string scheme = runtime.NewScheme() - eppTag = env.GetEnvString("EPP_TAG", "dev", ginkgo.GinkgoLogr) - vllmSimTag = env.GetEnvString("VLLM_SIMULATOR_TAG", "dev", ginkgo.GinkgoLogr) - routingSideCarTag = env.GetEnvString("ROUTING_SIDECAR_TAG", "v0.2.0", ginkgo.GinkgoLogr) + eppImage = env.GetEnvString("EPP_IMAGE", "ghcr.io/llm-d/llm-d-inference-scheduler:dev", ginkgo.GinkgoLogr) + vllmSimImage = env.GetEnvString("VLLM_SIMULATOR_IMAGE", "ghcr.io/llm-d/llm-d-inference-sim:dev", ginkgo.GinkgoLogr) + routingSideCarImage = env.GetEnvString("ROUTING_SIDECAR_IMAGE", "ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0", ginkgo.GinkgoLogr) existsTimeout = env.GetEnvDuration("EXISTS_TIMEOUT", defaultExistsTimeout, ginkgo.GinkgoLogr) readyTimeout = env.GetEnvDuration("READY_TIMEOUT", defaultReadyTimeout, ginkgo.GinkgoLogr) @@ -118,20 +118,17 @@ func setupK8sCluster() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", - "ghcr.io/llm-d/llm-d-inference-sim:"+vllmSimTag) + command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", vllmSimImage) session, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", - "ghcr.io/llm-d/llm-d-inference-scheduler:"+eppTag) + command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", eppImage) session, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", - "ghcr.io/llm-d/llm-d-routing-sidecar:"+routingSideCarTag) + command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", routingSideCarImage) session, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 54bf48e8..7b49954f 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -146,15 +146,15 @@ func createModelServers(withPD, withKV bool, vllmReplicas, prefillReplicas, deco manifests := readYaml(yaml) manifests = substituteMany(manifests, map[string]string{ - "${MODEL_NAME}": theModelName, - "${MODEL_NAME_SAFE}": theSafeModelName, - "${POOL_NAME}": poolName, - "${KV_CACHE_ENABLED}": strconv.FormatBool(withKV), - "${ROUTING_SIDECAR_TAG}": routingSideCarTag, - "${VLLM_REPLICA_COUNT}": strconv.Itoa(vllmReplicas), - "${VLLM_REPLICA_COUNT_D}": strconv.Itoa(decodeReplicas), - "${VLLM_REPLICA_COUNT_P}": strconv.Itoa(prefillReplicas), - "${VLLM_SIMULATOR_TAG}": vllmSimTag, + "${MODEL_NAME}": theModelName, + "${MODEL_NAME_SAFE}": theSafeModelName, + "${POOL_NAME}": poolName, + "${KV_CACHE_ENABLED}": strconv.FormatBool(withKV), + "${ROUTING_SIDECAR_IMAGE}": routingSideCarImage, + "${VLLM_REPLICA_COUNT}": strconv.Itoa(vllmReplicas), + "${VLLM_REPLICA_COUNT_D}": strconv.Itoa(decodeReplicas), + "${VLLM_REPLICA_COUNT_P}": strconv.Itoa(prefillReplicas), + "${VLLM_SIMULATOR_IMAGE}": vllmSimImage, }) objects := createObjsFromYaml(manifests) @@ -183,7 +183,7 @@ func createEndPointPicker(eppConfig string) []string { eppYamls := readYaml(eppManifest) eppYamls = substituteMany(eppYamls, map[string]string{ - "${EPP_TAG}": eppTag, + "${EPP_IMAGE}": eppImage, "${POOL_NAME}": modelName + "-inference-pool", }) diff --git a/test/e2e/yaml/deployments.yaml b/test/e2e/yaml/deployments.yaml index 041ce44c..2f47722b 100644 --- a/test/e2e/yaml/deployments.yaml +++ b/test/e2e/yaml/deployments.yaml @@ -18,7 +18,7 @@ spec: terminationGracePeriodSeconds: 130 containers: - name: epp - image: ghcr.io/llm-d/llm-d-inference-scheduler:${EPP_TAG} + image: ${EPP_IMAGE} imagePullPolicy: IfNotPresent args: - --pool-name diff --git a/test/e2e/yaml/vllm-sim-pd.yaml b/test/e2e/yaml/vllm-sim-pd.yaml index c3f38f7b..779f397f 100644 --- a/test/e2e/yaml/vllm-sim-pd.yaml +++ b/test/e2e/yaml/vllm-sim-pd.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: vllm - image: ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG} + image: ${VLLM_SIMULATOR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8000" @@ -60,7 +60,7 @@ spec: spec: initContainers: - name: routing-sidecar - image: ghcr.io/llm-d/llm-d-routing-sidecar:${ROUTING_SIDECAR_TAG} + image: ${ROUTING_SIDECAR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8000" @@ -74,7 +74,7 @@ spec: restartPolicy: Always containers: - name: vllm - image: ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG} + image: ${VLLM_SIMULATOR_IMAGE} imagePullPolicy: IfNotPresent args: - "--port=8200" diff --git a/test/e2e/yaml/vllm-sim.yaml b/test/e2e/yaml/vllm-sim.yaml index ce5a71af..36036996 100644 --- a/test/e2e/yaml/vllm-sim.yaml +++ b/test/e2e/yaml/vllm-sim.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: vllm - image: ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG} + image: ${VLLM_SIMULATOR_IMAGE} imagePullPolicy: IfNotPresent args: - "--mode=echo" diff --git a/test/scripts/run_e2e.sh b/test/scripts/run_e2e.sh index fe13508a..5d81e4ce 100755 --- a/test/scripts/run_e2e.sh +++ b/test/scripts/run_e2e.sh @@ -1,45 +1,5 @@ #!/bin/bash -# Use the CONTAINER_TOOL from the environment, or default to docker if it's not set. -CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" -echo "Using container tool: ${CONTAINER_TOOL}" - -# Set a default EPP_TAG if not provided -export EPP_TAG="${EPP_TAG:-dev}" - -# Set a default VLLM_SIMULATOR_TAG if not provided -export VLLM_SIMULATOR_TAG="${VLLM_SIMULATOR_TAG:-v0.4.0}" - -# Set the default routing side car image tag -export ROUTING_SIDECAR_TAG="${ROUTING_SIDECAR_TAG:-v0.2.0}" - -SIMTAG=$(${CONTAINER_TOOL} images | grep ghcr.io/llm-d/llm-d-inference-sim | awk '{print $2}' | grep ${VLLM_SIMULATOR_TAG}) -if [[ "${SIMTAG}" != "${VLLM_SIMULATOR_TAG}" ]]; then - ${CONTAINER_TOOL} pull ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG} - if [[ $? != 0 ]]; then - echo "Failed to pull ghcr.io/llm-d/llm-d-inference-sim:${VLLM_SIMULATOR_TAG}" - exit 1 - fi -fi - -EPPTAG=$(${CONTAINER_TOOL} images | grep ghcr.io/llm-d/llm-d-inference-scheduler | awk '{print $2}' | grep ${EPP_TAG}) -if [[ "${EPPTAG}" != "${EPP_TAG}" ]]; then - ${CONTAINER_TOOL} pull ghcr.io/llm-d/llm-d-inference-scheduler:${EPP_TAG} - if [[ $? != 0 ]]; then - echo "Failed to pull ghcr.io/llm-d/llm-d-inference-scheduler:${EPP_TAG}" - exit 1 - fi -fi - -SIDECARTAG=$(${CONTAINER_TOOL} images | grep ghcr.io/llm-d/llm-d-routing-sidecar | awk '{print $2}' | grep ${ROUTING_SIDECAR_TAG}) -if [[ "${SIDECARTAG}" != "${ROUTING_SIDECAR_TAG}" ]]; then - ${CONTAINER_TOOL} pull ghcr.io/llm-d/llm-d-routing-sidecar:${ROUTING_SIDECAR_TAG} - if [[ $? != 0 ]]; then - echo "Failed to pull ghcr.io/llm-d/llm-d-routing-sidecar:${ROUTING_SIDECAR_TAG}" - exit 1 - fi -fi - echo "Running end to end tests" DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" From 40a7a9080ef8e512ff5e47333b89baff7c18c7d6 Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Mon, 6 Oct 2025 15:02:59 -0400 Subject: [PATCH 04/10] Cleanup --- scripts/kind-dev-env.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index f8baf041..2b0f261c 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -23,10 +23,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" : "${IMAGE_REGISTRY:=ghcr.io/llm-d}" # Set a default VLLM_SIMULATOR_IMAGE if not provided -: "${VLLM_SIMULATOR_IMAGE:=llm-d-inference-sim}" - -# Set a default VLLM_SIMULATOR_IMAGE if not provided -export VLLM_SIMULATOR_IMAGE="${VLLM_SIMULATOR_IMAGE:-ghcr.io/llm-d/llm-d-inference-sim:v0.4.0}" +export VLLM_SIMULATOR_IMAGE="${VLLM_SIMULATOR_IMAGE:-ghcr.io/llm-d/llm-d-inference-sim:latest}" # Set a default EPP_IMAGE if not provided export EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-inference-scheduler:dev}" From 40e875efc5cdcda0f5e8b9301a22256dd3c845a6 Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Mon, 6 Oct 2025 15:06:48 -0400 Subject: [PATCH 05/10] Set VLLM_SIMULATOR_IMAGE and ROUTING_SIDECAR_IMAGE from the makefile --- Makefile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e6c7832a..bac1fe03 100644 --- a/Makefile +++ b/Makefile @@ -6,13 +6,23 @@ SHELL := /usr/bin/env bash # Defaults TARGETOS ?= $(shell go env GOOS) TARGETARCH ?= $(shell go env GOARCH) +NAMESPACE ?= hc4ai-operator +# Image default PROJECT_NAME ?= llm-d-inference-scheduler IMAGE_REGISTRY ?= ghcr.io/llm-d EPP_IMG_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME) EPP_TAG ?= dev EPP_IMAGE = $(EPP_IMG_TAG_BASE):$(EPP_TAG) export EPP_IMAGE -NAMESPACE ?= hc4ai-operator +# Image dependency defaults +VLLM_SIM_IMG_TAG_BASE ?= $(IMAGE_REGISTRY)/llm-d-inference-sim +VLLM_SIMULATOR_TAG ?= latest +VLLM_SIMULATOR_IMAGE = $(VLLM_SIM_IMG_TAG_BASE):${VLLM_SIMULATOR_TAG} +export VLLM_SIMULATOR_IMAGE +ROUTING_SIDECAR_IMG_TAG_BASE ?= $(IMAGE_REGISTRY)/llm-d-routing-sidecar +ROUTING_SIDECAR_TAG ?= v0.2.0 +ROUTING_SIDECAR_IMAGE = ${ROUTING_SIDECAR_IMG_TAG_BASE}:${ROUTING_SIDECAR_TAG} +export ROUTING_SIDECAR_IMAGE # Map go arch to typos arch ifeq ($(TARGETARCH),amd64) From 47949982ceeedd69062082afbe260a72041b22e6 Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Mon, 13 Oct 2025 12:07:22 -0400 Subject: [PATCH 06/10] Make the image pull script more robust to local-only image tags Signed-off-by: Hannah DeFazio --- Makefile | 2 +- scripts/pull_images.sh | 50 ++++++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 20 deletions(-) mode change 100644 => 100755 scripts/pull_images.sh diff --git a/Makefile b/Makefile index bac1fe03..1313c4e1 100644 --- a/Makefile +++ b/Makefile @@ -146,7 +146,7 @@ image-push: check-container-tool ## Push Docker image $(EPP_IMAGE) to registry @printf "\033[33;1m==== Pushing Docker image $(EPP_IMAGE) ====\033[0m\n" $(CONTAINER_TOOL) push $(EPP_IMAGE) -.PHONY image-pull +.PHONY: image-pull image-pull: check-container-tool ## Pull all related images using $(CONTAINER_TOOL) @printf "\033[33;1m==== Pulling Docker images ====\033[0m\n" ./scripts/pull_images.sh diff --git a/scripts/pull_images.sh b/scripts/pull_images.sh old mode 100644 new mode 100755 index 61e8fbee..3cfb32be --- a/scripts/pull_images.sh +++ b/scripts/pull_images.sh @@ -4,33 +4,45 @@ CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" echo "Using container tool: ${CONTAINER_TOOL}" -export EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-inference-scheduler:dev}" +export EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-inference-scheduler:latest}" export VLLM_SIMULATOR_IMAGE="${VLLM_SIMULATOR_IMAGE:-ghcr.io/llm-d/llm-d-inference-sim:v0.4.0}" export ROUTING_SIDECAR_IMAGE="${ROUTING_SIDECAR_IMAGE:-ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0}" +# --- Helper Function to Ensure Image Availability --- +# This function checks the registry first, then falls back to a local-only check. +ensure_image() { + local image_name="$1" + echo "Checking for image: ${image_name}" + + # Attempt to inspect the image manifest on the remote registry. + if ${CONTAINER_TOOL} manifest inspect "${image_name}" > /dev/null 2>&1; then + echo " -> Image found on registry. Pulling..." + if ! ${CONTAINER_TOOL} pull "${image_name}"; then + echo " ❌ ERROR: Failed to pull image '${image_name}'." + exit 1 + fi + echo " ✅ Successfully pulled image." + else + # If the image is not on the registry, check if it's already available locally. + echo " -> Image not found on registry. Checking for a local version..." + if [ -z "$(${CONTAINER_TOOL} images -q "${image_name}")" ]; then + # If it's not on the registry AND not local, it's an error. + echo " ❌ ERROR: Image '${image_name}' is not available locally and could not be found on the registry." + exit 1 + fi + echo " -> Found local-only image. Proceeding." + fi +} + # --- Print Final Images and Pull Dependencies --- -echo "--- Using the following images for the E2E test ---" +echo "--- Using the following images ---" echo "Scheduler Image: ${EPP_IMAGE}" echo "Simulator Image: ${VLLM_SIMULATOR_IMAGE}" echo "Sidecar Image: ${ROUTING_SIDECAR_IMAGE}" echo "----------------------------------------------------" echo "Pulling dependencies..." -${CONTAINER_TOOL} pull ${EPP_IMAGE} -if [[ $? != 0 ]]; then - echo "Failed to pull ${EPP_IMAGE}" - exit 1 -fi - -${CONTAINER_TOOL} pull ${VLLM_SIMULATOR_IMAGE} -if [[ $? != 0 ]]; then - echo "Failed to pull ${VLLM_SIMULATOR_IMAGE}" - exit 1 -fi - -${CONTAINER_TOOL} pull ${ROUTING_SIDECAR_IMAGE} -if [[ $? != 0 ]]; then - echo "Failed to pull ${ROUTING_SIDECAR_IMAGE}" - exit 1 -fi +ensure_image "${EPP_IMAGE}" +ensure_image "${VLLM_SIMULATOR_IMAGE}" +ensure_image "${ROUTING_SIDECAR_IMAGE}" echo "Successfully pulled dependencies" From 2d376a20686e45b15dd4b3d60022eee8e260f56b Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Thu, 16 Oct 2025 20:06:42 -0400 Subject: [PATCH 07/10] Allow the e2e test setup to use podman --- test/e2e/e2e_suite_test.go | 63 +++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 8ccff2d9..6a4de257 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -3,6 +3,7 @@ package e2e import ( "context" "io" + "os" "os/exec" "strings" "testing" @@ -100,6 +101,51 @@ var _ = ginkgo.AfterSuite(func() { gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) }) +// loadImageIntoKind loads the specified image +// into the Kind cluster using the most appropriate method based on the container runtime. +func loadImageIntoKind(imageName string) { + container_runtime := env.GetEnvString("CONTAINER_TOOL", "docker", ginkgo.GinkgoLogr) + ginkgo.By("Loading image into Kind cluster: " + imageName) + + switch container_runtime { + case "podman": + // Detect if podman is available + podmanPath, podmanErr := exec.LookPath("podman") + gomega.Expect(podmanErr).ShouldNot(gomega.HaveOccurred(), "Could not find podman in PATH") + ginkgo.GinkgoLogr.Info("Podman detected, using image-archive method.", "path", podmanPath) + + // Create a temporary file to hold the image archive. + tmpFile, err := os.CreateTemp("", "image-archive-*.tar") + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + // Ensure the temporary file is cleaned up when the function exits. + defer os.Remove(tmpFile.Name()) + + // Save the image to the temp file + cmdPodmanSave := exec.Command("podman", "save", "-o", tmpFile.Name(), imageName) + saveSession, err := gexec.Start(cmdPodmanSave, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + gomega.Eventually(saveSession).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) + + // Load the temp file image into kind + cmdKindLoad := exec.Command("kind", "load", "image-archive", "--name", "e2e-tests", tmpFile.Name()) + loadSession, err := gexec.Start(cmdKindLoad, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + gomega.Eventually(loadSession).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) + case "docker": + // Detect if podman is available + dockerPath, dockerErr := exec.LookPath("docker") + gomega.Expect(dockerErr).ShouldNot(gomega.HaveOccurred(), "Could not find docker in PATH") + + ginkgo.GinkgoLogr.Info("Docker detected, using docker-image method.", "path", dockerPath) + command := exec.Command("kind", "load", "docker-image", "--name", "e2e-tests", imageName) + session, err := gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) + default: + ginkgo.Fail("ERROR: Could not find 'podman' or 'docker' in the system's PATH. Please install one to continue.") + } +} + // Create the Kubernetes cluster for the E2E tests and load the local images func setupK8sCluster() { command := exec.Command("kind", "create", "cluster", "--name", "e2e-tests", "--config", "-") @@ -118,20 +164,9 @@ func setupK8sCluster() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", vllmSimImage) - session, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) - gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) - gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - - command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", eppImage) - session, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) - gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) - gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - - command = exec.Command("kind", "--name", "e2e-tests", "load", "docker-image", routingSideCarImage) - session, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) - gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) - gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) + loadImageIntoKind(vllmSimImage) + loadImageIntoKind(eppImage) + loadImageIntoKind(routingSideCarImage) } func setupK8sClient() { From 5a23fdc42fea5cc8a5ff6f095a91c6ae3e3457ae Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Thu, 16 Oct 2025 20:32:52 -0400 Subject: [PATCH 08/10] Load CONTAINER_TOOL only once --- test/e2e/e2e_suite_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 6a4de257..294aa944 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -61,6 +61,7 @@ var ( port string scheme = runtime.NewScheme() + container_runtime = env.GetEnvString("CONTAINER_TOOL", "docker", ginkgo.GinkgoLogr) eppImage = env.GetEnvString("EPP_IMAGE", "ghcr.io/llm-d/llm-d-inference-scheduler:dev", ginkgo.GinkgoLogr) vllmSimImage = env.GetEnvString("VLLM_SIMULATOR_IMAGE", "ghcr.io/llm-d/llm-d-inference-sim:dev", ginkgo.GinkgoLogr) routingSideCarImage = env.GetEnvString("ROUTING_SIDECAR_IMAGE", "ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0", ginkgo.GinkgoLogr) @@ -103,8 +104,7 @@ var _ = ginkgo.AfterSuite(func() { // loadImageIntoKind loads the specified image // into the Kind cluster using the most appropriate method based on the container runtime. -func loadImageIntoKind(imageName string) { - container_runtime := env.GetEnvString("CONTAINER_TOOL", "docker", ginkgo.GinkgoLogr) +func loadImageIntoKind(container_runtime string, imageName string) { ginkgo.By("Loading image into Kind cluster: " + imageName) switch container_runtime { @@ -164,9 +164,9 @@ func setupK8sCluster() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - loadImageIntoKind(vllmSimImage) - loadImageIntoKind(eppImage) - loadImageIntoKind(routingSideCarImage) + loadImageIntoKind(container_runtime, vllmSimImage) + loadImageIntoKind(container_runtime, eppImage) + loadImageIntoKind(container_runtime, routingSideCarImage) } func setupK8sClient() { From f3332443003d40363b52f84f1ded15060c58f1ad Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Thu, 16 Oct 2025 20:36:58 -0400 Subject: [PATCH 09/10] Don't check if docker is installed as it is not directly run here --- test/e2e/e2e_suite_test.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 294aa944..57370c9e 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -132,17 +132,13 @@ func loadImageIntoKind(container_runtime string, imageName string) { gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(loadSession).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) case "docker": - // Detect if podman is available - dockerPath, dockerErr := exec.LookPath("docker") - gomega.Expect(dockerErr).ShouldNot(gomega.HaveOccurred(), "Could not find docker in PATH") - - ginkgo.GinkgoLogr.Info("Docker detected, using docker-image method.", "path", dockerPath) + ginkgo.GinkgoLogr.Info("Docker detected, using docker-image method.") command := exec.Command("kind", "load", "docker-image", "--name", "e2e-tests", imageName) session, err := gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter) gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) default: - ginkgo.Fail("ERROR: Could not find 'podman' or 'docker' in the system's PATH. Please install one to continue.") + ginkgo.Fail("ERROR: The CONTAINER_TOOL value must be 'docker' or 'podman'") } } From 399d919465af3ea0c5a336a030837e82264df13a Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Thu, 16 Oct 2025 20:38:52 -0400 Subject: [PATCH 10/10] container_runtime does not need to be passed to the function --- test/e2e/e2e_suite_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 57370c9e..d733ceac 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -104,7 +104,7 @@ var _ = ginkgo.AfterSuite(func() { // loadImageIntoKind loads the specified image // into the Kind cluster using the most appropriate method based on the container runtime. -func loadImageIntoKind(container_runtime string, imageName string) { +func loadImageIntoKind(imageName string) { ginkgo.By("Loading image into Kind cluster: " + imageName) switch container_runtime { @@ -160,9 +160,9 @@ func setupK8sCluster() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) gomega.Eventually(session).WithTimeout(600 * time.Second).Should(gexec.Exit(0)) - loadImageIntoKind(container_runtime, vllmSimImage) - loadImageIntoKind(container_runtime, eppImage) - loadImageIntoKind(container_runtime, routingSideCarImage) + loadImageIntoKind(vllmSimImage) + loadImageIntoKind(eppImage) + loadImageIntoKind(routingSideCarImage) } func setupK8sClient() {