diff --git a/.gitignore b/.gitignore
index f0e0f9876..c0507f700 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@
 *.dll
 *.so
 *.dylib
+bin/
+.vscode/
 
 # Test binary, built with `go test -c`
 *.test
@@ -14,6 +16,9 @@
 # Dependency directories (remove the comment below to include it)
 vendor/
 
+# Kind config file
+kind-config.yaml
+
 # Miscellaneous
 .DS_Store
 ~*
diff --git a/Dockerfile b/Dockerfile
index d78a93dd4..348b8372c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,30 +1,33 @@
-# Use a multi-stage build
-FROM golang:1.23-bookworm AS builder
+# Build the manager binary
+FROM docker.io/golang:1.23 AS builder
+ARG TARGETOS
+ARG TARGETARCH
 
-# Install the lpsolve package
-RUN apt-get update && apt-get install -y liblpsolve55-dev
+WORKDIR /workspace
+# Copy the Go Modules manifests
+COPY go.mod go.mod
+COPY go.sum go.sum
+# cache deps before building and copying source so that we don't need to re-download as much
+# and so that source changes don't invalidate our downloaded layer
+RUN go mod download
 
-WORKDIR /app
-COPY . .
+# Copy the go source
+COPY cmd/main.go cmd/main.go
+COPY api/ api/
+COPY internal/ internal/
 
-# Set CGO flags for lpsolve package
-ENV CGO_CFLAGS="-I/usr/include/lpsolve"
-ENV CGO_LDFLAGS="-llpsolve55 -lm -ldl -lcolamd"
+# Build
+# GOARCH has no default value, so the binary is built for the platform of the host where the
+# command is run. For example, calling make docker-build on an Apple Silicon (M1) machine makes
+# the docker BUILDPLATFORM arg linux/arm64, while on Apple x86 it will be linux/amd64. Therefore,
+# by leaving it empty we ensure that the container and the binary shipped in it have the same platform.
+RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go
 
-# Build all main.go files in cmd directory
-RUN for file in $(find cmd -name "main.go"); do \
-    dir=$(dirname "$file"); \
-    name=$(basename "$dir"); \
-    go build -o bin/$name $file; \
-    done
+# Use distroless as minimal base image to package the manager binary
+# Refer to https://github.com/GoogleContainerTools/distroless for more details
+FROM gcr.io/distroless/static:nonroot
+WORKDIR /
+COPY --from=builder /workspace/manager .
+USER 65532:65532
 
-# Create the final image
-FROM debian:bookworm-slim
-RUN apt-get update && apt-get install -y liblpsolve55-dev
-COPY --from=builder /app/bin /bin
-
-# Expose the port the API will listen on
-EXPOSE 8080
-
-# Command to run the binary when the container starts
-CMD ["optimizer"]
\ No newline at end of file
+ENTRYPOINT ["/manager"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..d95a2bbb4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,225 @@
+# Image URL to use all building/pushing image targets
+IMG ?= controller:latest
+
+# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
+ifeq (,$(shell go env GOBIN))
+GOBIN=$(shell go env GOPATH)/bin
+else
+GOBIN=$(shell go env GOBIN)
+endif
+
+# CONTAINER_TOOL defines the container tool to be used for building images.
+# Be aware that the target commands are only tested with Docker which is
+# scaffolded by default. However, you might want to replace it to use other
+# tools. (e.g., podman)
+CONTAINER_TOOL ?= docker
+
+# Setting SHELL to bash allows bash commands to be executed by recipes.
+# Options are set to exit when a recipe line exits non-zero or a piped command fails.
+SHELL = /usr/bin/env bash -o pipefail
+.SHELLFLAGS = -ec
+
+.PHONY: all
+all: build
+
+##@ General
+
+# The help target prints out all targets with their descriptions organized
+# beneath their categories. The categories are represented by '##@' and the
+# target descriptions by '##'. The awk command is responsible for reading the
+# entire set of makefiles included in this invocation, looking for lines of the
+# file as xyz: ## something, and then pretty-format the target and help. Then,
+# if there's a line with ##@ something, that gets pretty-printed as a category.
+# More info on the usage of ANSI control characters for terminal formatting:
+# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters
+# More info on the awk command:
+# http://linuxcommand.org/lc3_adv_awk.php
+
+.PHONY: help
+help: ## Display this help.
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ Development
+
+.PHONY: manifests
+manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
+	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
+
+.PHONY: generate
+generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
+	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
+
+.PHONY: fmt
+fmt: ## Run go fmt against code.
+	go fmt ./...
+
+.PHONY: vet
+vet: ## Run go vet against code.
+	go vet ./...
+
+.PHONY: test
+test: manifests generate fmt vet setup-envtest ## Run tests.
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out
+
+# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'.
+# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally.
+# CertManager is installed by default; skip with:
+# - CERT_MANAGER_INSTALL_SKIP=true
+.PHONY: test-e2e
+test-e2e: manifests generate fmt vet ## Run the e2e tests. Expects an isolated environment using Kind.
+	@command -v $(KIND) >/dev/null 2>&1 || { \
+		echo "Kind is not installed. Please install Kind manually."; \
+		exit 1; \
+	}
+	@$(KIND) get clusters | grep -q 'kind' || { \
+		echo "No Kind cluster is running. Please start a Kind cluster before running the e2e tests."; \
+		exit 1; \
+	}
+	go test ./test/e2e/ -v -ginkgo.v
+
+.PHONY: lint
+lint: golangci-lint ## Run golangci-lint linter
+	$(GOLANGCI_LINT) run
+
+.PHONY: lint-fix
+lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
+	$(GOLANGCI_LINT) run --fix
+
+.PHONY: lint-config
+lint-config: golangci-lint ## Verify golangci-lint linter configuration
+	$(GOLANGCI_LINT) config verify
+
+##@ Build
+
+.PHONY: build
+build: manifests generate fmt vet ## Build manager binary.
+	go build -o bin/manager cmd/main.go
+
+.PHONY: run
+run: manifests generate fmt vet ## Run a controller from your host.
+	go run ./cmd/main.go
+
+# If you wish to build the manager image targeting other platforms you can use the --platform flag.
+# (i.e. docker build --platform linux/arm64). However, you must enable Docker BuildKit for it.
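+# For example (hypothetical image name; BuildKit enabled via the DOCKER_BUILDKIT env var):
+#   DOCKER_BUILDKIT=1 docker build --platform linux/arm64 -t example.com/inferno-autoscaler:dev .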
+# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
+.PHONY: docker-build
+docker-build: ## Build docker image with the manager.
+	$(CONTAINER_TOOL) build -t ${IMG} .
+
+.PHONY: docker-push
+docker-push: ## Push docker image with the manager.
+	$(CONTAINER_TOOL) push ${IMG}
+
+# PLATFORMS defines the target platforms the manager image is built for, to provide support for multiple
+# architectures. (i.e. make docker-buildx IMG=myregistry/myoperator:0.0.1). To use this option you need to:
+# - be able to use docker buildx. More info: https://docs.docker.com/build/buildx/
+# - have enabled BuildKit. More info: https://docs.docker.com/develop/develop-images/build_enhancements/
+# - be able to push the image to your registry (i.e. if you do not set a valid value via IMG=<myregistry/image:<tag>> then the export will fail)
+# To adequately provide solutions that are compatible with multiple platforms, you should consider using this option.
+PLATFORMS ?= linux/arm64,linux/amd64,linux/s390x,linux/ppc64le
+.PHONY: docker-buildx
+docker-buildx: ## Build and push docker image for the manager for cross-platform support
+	# copy existing Dockerfile and insert --platform=${BUILDPLATFORM} into Dockerfile.cross, and preserve the original Dockerfile
+	sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross
+	- $(CONTAINER_TOOL) buildx create --name inferno-autoscaler-builder
+	$(CONTAINER_TOOL) buildx use inferno-autoscaler-builder
+	- $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross .
+	- $(CONTAINER_TOOL) buildx rm inferno-autoscaler-builder
+	rm Dockerfile.cross
+
+.PHONY: build-installer
+build-installer: manifests generate kustomize ## Generate a consolidated YAML with CRDs and deployment.
+	mkdir -p dist
+	cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
+	$(KUSTOMIZE) build config/default > dist/install.yaml
+
+##@ Deployment
+
+ifndef ignore-not-found
+  ignore-not-found = false
+endif
+
+.PHONY: install
+install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config.
+	$(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f -
+
+.PHONY: uninstall
+uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
+	$(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -
+
+.PHONY: deploy
+deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
+	cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
+	$(KUSTOMIZE) build config/default | $(KUBECTL) apply -f -
+
+.PHONY: undeploy
+undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
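+# Example: make undeploy ignore-not-found=true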
+	$(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -
+
+##@ Dependencies
+
+## Location to install dependencies to
+LOCALBIN ?= $(shell pwd)/bin
+$(LOCALBIN):
+	mkdir -p $(LOCALBIN)
+
+## Tool Binaries
+KUBECTL ?= kubectl
+KIND ?= kind
+KUSTOMIZE ?= $(LOCALBIN)/kustomize
+CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
+ENVTEST ?= $(LOCALBIN)/setup-envtest
+GOLANGCI_LINT = $(LOCALBIN)/golangci-lint
+
+## Tool Versions
+KUSTOMIZE_VERSION ?= v5.6.0
+CONTROLLER_TOOLS_VERSION ?= v0.17.2
+#ENVTEST_VERSION is the version of controller-runtime release branch to fetch the envtest setup script (i.e. release-0.20)
+ENVTEST_VERSION ?= $(shell go list -m -f "{{ .Version }}" sigs.k8s.io/controller-runtime | awk -F'[v.]' '{printf "release-%d.%d", $$2, $$3}')
+#ENVTEST_K8S_VERSION is the version of Kubernetes to use for setting up ENVTEST binaries (i.e. 1.31)
+ENVTEST_K8S_VERSION ?= $(shell go list -m -f "{{ .Version }}" k8s.io/api | awk -F'[v.]' '{printf "1.%d", $$3}')
+GOLANGCI_LINT_VERSION ?= v1.63.4
+
+.PHONY: kustomize
+kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
+$(KUSTOMIZE): $(LOCALBIN)
+	$(call go-install-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION))
+
+.PHONY: controller-gen
+controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary.
+$(CONTROLLER_GEN): $(LOCALBIN)
+	$(call go-install-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen,$(CONTROLLER_TOOLS_VERSION))
+
+.PHONY: setup-envtest
+setup-envtest: envtest ## Download the binaries required for ENVTEST in the local bin directory.
+	@echo "Setting up envtest binaries for Kubernetes version $(ENVTEST_K8S_VERSION)..."
+	@$(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path || { \
+		echo "Error: Failed to set up envtest binaries for version $(ENVTEST_K8S_VERSION)."; \
+		exit 1; \
+	}
+
+.PHONY: envtest
+envtest: $(ENVTEST) ## Download setup-envtest locally if necessary.
+$(ENVTEST): $(LOCALBIN)
+	$(call go-install-tool,$(ENVTEST),sigs.k8s.io/controller-runtime/tools/setup-envtest,$(ENVTEST_VERSION))
+
+.PHONY: golangci-lint
+golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
+$(GOLANGCI_LINT): $(LOCALBIN)
+	$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
+
+# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
+# $1 - target path with name of binary
+# $2 - package url which can be installed
+# $3 - specific version of package
+define go-install-tool
+@[ -f "$(1)-$(3)" ] || { \
+set -e; \
+package=$(2)@$(3) ;\
+echo "Downloading $${package}" ;\
+rm -f $(1) || true ;\
+GOBIN=$(LOCALBIN) go install $${package} ;\
+mv $(1) $(1)-$(3) ;\
+} ;\
+ln -sf $(1)-$(3) $(1)
+endef
diff --git a/PROJECT b/PROJECT
new file mode 100644
index 000000000..da579e2bf
--- /dev/null
+++ b/PROJECT
@@ -0,0 +1,20 @@
+# Code generated by tool. DO NOT EDIT.
+# This file is used to track the info used to scaffold your project
+# and allow the plugins properly work.
+# More info: https://book.kubebuilder.io/reference/project-config.html
+domain: ai
+layout:
+- go.kubebuilder.io/v4
+projectName: inferno-autoscaler
+repo: github.com/llm-d-incubation/inferno-autoscaler
+resources:
+- api:
+    crdVersion: v1
+    namespaced: true
+  controller: true
+  domain: ai
+  group: llmd
+  kind: VariantAutoscaling
+  path: github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1
+  version: v1alpha1
+version: "3"
diff --git a/README.md b/README.md
index c9dfd5567..93decc75f 100644
--- a/README.md
+++ b/README.md
@@ -1,155 +1,254 @@
-# Inference system optimizer
+# inferno-autoscaler
+The inferno-autoscaler assigns GPU types to inference model servers and decides on the number of replicas for each model, as well as the batch size, for a given request traffic load and classes of service.

-The inference system optimizer assigns GPU types to inference model servers and decides on the number of replicas for each model for a given request traffic load and classes of service, as well as the batch size. ([slides](docs/slides/inferno-dynamic.pdf))
+## Description

-## Building
+The inferno-autoscaler is a Kubernetes controller that performs optimized autoscaling using the components below:

-```bash
-docker build -t inferno . --load
+![Diagram](docs/diagrams/inferno-WVA-design.png)
+
+Reconciler:
+
+The controller is implemented using the controller-runtime framework, which reconciles the namespace-scoped VariantAutoscaling objects created by the platform administrator, one per model. Due to runtime variability in model behavior (e.g., differences in prompt lengths, output sizes, or server-level contention), we treat model analysis as a continuously reconciled step during every autoscaler loop.
+
+Collector(s):
+The collectors, which run inside the controller, gather data about the cluster state and the state of the vLLM servers.
+
+Actuator:
+The actuator is responsible for emitting metrics to the desired targets, such as Prometheus, or for changing the replicas of existing deployments running on the cluster, which is the case with the Inferno autoscaler.
+
+Model Analyzer:
+The Model Analyzer is a component that runs per model to perform scaling, estimation, prediction, and tuning.
+
+Proposed sources:
+These include the new [API proposal](https://docs.google.com/document/d/1j2KRAT68_FYxq1iVzG0xVL-DHQhGVUZBqiM22Hd_0hc/edit?usp=drivesdk&resourcekey=0-5cSovS8QcRQNYXj0_kRMiw), which is expected to work in conjunction with the inference scheduler (EPP) to provide insights into the request scheduler's dispatching logic.
+
+For more details, please refer to the community proposal [here](https://docs.google.com/document/d/1n6SAhloQaoSyF2k3EveIOerT-f97HuWXTLFm07xcvqk/edit?tab=t.0).
+
+## Getting Started
+
+### Prerequisites
+- go version v1.23.0+
+- docker version 17.03+
+- kubectl version v1.11.3+
+- Access to a Kubernetes v1.11.3+ cluster
+
+### Create cluster with fake GPUs
+
+```sh
+bash deploy/local-cluster.sh
+```
+
+### To Deploy on the cluster
+**Build and push your image to the location specified by `IMG`:**
+
+```sh
+make docker-build docker-push IMG=<some-registry>/inferno-autoscaler:tag
+```
+
+**NOTE:** This image ought to be published in the personal registry you specified,
+and your working environment must have access to pull the image.
+Make sure you have the proper permissions for the registry if the above commands don’t work.
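+
+For example, with a hypothetical personal registry and tag:
+
+```sh
+make docker-build docker-push IMG=quay.io/example-user/inferno-autoscaler:v0.1.0
+```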
+
+**Install the CRDs into the cluster:**
+
+```sh
+make install
```

-## Prerequisites
+**Install the configmap to run the optimizer loop:**

-- lp_solve Mixed Integer Linear Programming (MILP) solver
+```sh
+kubectl apply -f deploy/ticker-configmap.yaml
+```

-  [Installation instructions and code](https://github.com/llm-inferno/lpsolve)
-
-- IBM CPLEX (optional)
+**Deploy the Manager to the cluster with the image specified by `IMG`:**

-  Information and instructions [IBM CPLEX as a solver](https://github.com/llm-inferno/lpsolve/tree/main/cplex)
+```sh
+make deploy IMG=<some-registry>/inferno-autoscaler:tag
+
+# prebuilt image
+# make deploy IMG=quay.io/amalvank/inferno:latest
+```

-## Running
+> **NOTE**: If you encounter RBAC errors, you may need to grant yourself cluster-admin
privileges or be logged in as admin.

-First, install [prerequisites](#prerequisites) if running locally (not using an image).
+### To Uninstall

-### I. Optimizer only
+**Delete the APIs (CRDs) from the cluster:**

-There are two ways to run the optimizer.
+```sh
+make uninstall
```

-1. **Direct function calls**: An example is provided in [main.go](demos/main/main.go).
+**Undeploy the controller from the cluster:**

-   ```bash
-   cd demos/main
-   go run main.go
-   ```
+```sh
+make undeploy
+```

-2. **REST API server**: The optimizer may run as a REST API server ([steps](#steps-to-run-the-optimizer-as-a-rest-api-server)).
+**Delete cluster**

-### II. Optimized auto-scaler
+```sh
+kind delete cluster -n a100-cluster
+```

-One may run the optimizer as part of an auto-scaling control system, in one of two ways.
+## Local development

-1. **Kubernetes controller**: Running in a Kubernetes cluster and using custom resources and a Kubernetes runtime controller, the optimizer may be excercised in reconciliation to updates to the Optimizer custom resource ([reference](https://github.com/llm-inferno/controller)).
+Local development needs an emulated vLLM server and Prometheus installed in a KinD cluster.

-2. **Optimization control loop**: The control loop comprises (1) a Collector to get data about the inference servers through Prometheus and server deployments, (2) an Optimizer to make decisions, (3) an Actuator to realize such decisions by updating server deployments, and (4) a periodic Controller that has access to static and dynamic data. The [control loop](https://github.com/llm-inferno/control-loop) may run either externally or in a Kubernetes cluster.
+**Create namespace**

-### Steps to run the optimizer as a REST API server
+```sh
+kubectl create ns monitoring
+```

-The REST API specifications are [documented](rest-server/README.md).
+**Install prometheus**

-Clone this repository and set environment variable `INFERNO_REPO` to the path to it.
+```sh
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring
+```

-#### Option A: Run externally

-```bash
-cd $INFERNO_REPO/cmd/optimizer
-go run main.go [-F]
```

-The default is to run the server in **Stateless** mode. Use the optional `-F` argument to run in **Statefull** mode. 
([Description of modes](rest-server/README.md#rest-server-modes))

+**Wait for prometheus installation to complete**
+```sh
+kubectl apply -f samples/local-dev/prometheus-deploy-all-in-one.yaml
+kubectl get -n default prometheus prometheus -w
+kubectl get services
+
+NAME                  TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)    AGE
+prometheus-operated   ClusterIP   None         <none>        9090/TCP   17s
+
+```

-You may then curl [API commands](rest-server/README.md#commands-list) to `http://localhost:8080`.
+**Access the server**

-#### Option B: Run in cluster
+```sh
+kubectl port-forward svc/prometheus-operated 9090:9090
+# server can be accessed at location: http://localhost:9090
+```

-- Deploy optimizer as a deployment, along with a service on port `80`, in name space `inferno` in the cluster. (The deployment yaml file starts the server in a container with the `-F` flag.)
+**Create vllm emulated deployment**

-  ```bash
-  cd $INFERNO_REPO/manifests/yamls
-  kubectl apply -f deploy-optimizer.yaml
-  ```
+```sh
+kubectl apply -f samples/local-dev/vllme-deployment-with-service-and-servicemon.yaml
+
+kubectl get deployments
+NAME               READY   UP-TO-DATE   AVAILABLE   AGE
+vllme-deployment   1/1     1            1           35s
+
+kubectl port-forward svc/vllme-service 8000:80
+```

-- Forward port to local host.
+**Load generation**

-  ```bash
-  kubectl port-forward service/inferno-optimizer -n inferno 8080:80
-  ```
+```sh
+git clone https://github.com/vishakha-ramani/vllm_emulator.git -b new-metric
+
+# run script
+sh ./loadgen.sh
+
+```

-  You may then curl API commands (above) to `http://localhost:8080`.
+**Run sample query**

-- (Optional) Inspect logs.
+```sh
+curl -G http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=sum(rate(vllm:requests_count_total[1m])) * 60'

-  ```bash
-  POD=$(kubectl get pod -l app=inferno-optimizer -n inferno -o jsonpath="{.items[0].metadata.name}")
-  kubectl logs -f $POD -n inferno
-  ```
+curl -G http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=sum(rate(vllm:requests_count_total[1m])) * 60'
+{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1752075000.160,"9.333333333333332"]}]}}

-- Cleanup.
+```

-  ```bash
-  kubectl delete -f deploy-optimizer.yaml
-  ```
+**Accessing Grafana**

-## Detailed description of the optimizer
+```sh
+# username: admin
+# password: prom-operator
+kubectl port-forward svc/kube-prometheus-stack-grafana 3000:80 -n monitoring
+```

-![problem-scope](docs/figs/Slide5.png)
+**Creating dummy workload**

-![timing-definitions](docs/figs/Slide30.png)
+  ```sh
+  kubectl apply -f samples/local-dev/vllme-deployment-with-service-and-servicemon.yaml
+  ```

-![request-batching](docs/figs/Slide6.png)
+**Creating variant autoscaling object for controller**

-![token-time-fitting](docs/figs/Slide7.png)
-![modeling-batching](docs/figs/Slide9.png)
+```sh
+kubectl apply -f samples/local-dev/vllme-variantautoscaling.yaml
+
+# view status of the variant autoscaling object to get status of optimization
+```

-![qn-model](docs/figs/Slide8.png)
-![system-occupancy](docs/figs/Slide32.png)
+## Project Distribution

-![impact-batch](docs/figs/Slide33.png)
+The following are options to release and provide this solution to users.

-![target-service](docs/figs/Slide34.png)
+### By providing a bundle with all YAML files

-Decision variables
+1. 
Build the installer for the image built and published in the registry:

-For each pair of (class of service, model):
+```sh
+make build-installer IMG=<some-registry>/inferno-autoscaler:tag
+```

-- gpuProfile: the GPU type allocated
-- numReplicas: the number of replicas
-- batchSize: the batch size, given continuous batching
+**NOTE:** The makefile target mentioned above generates an 'install.yaml'
+file in the dist directory. This file contains all the resources built
+with Kustomize, which are necessary to install this project without its
+dependencies.

-## Specifications: Accelerators and models
+2. Using the installer

-![accelerators](docs/figs/Slide13.png)
+Users can just run 'kubectl apply -f <URL for YAML BUNDLE>' to install
+the project, i.e.:

-![models](docs/figs/Slide14.png)
+```sh
+kubectl apply -f https://raw.githubusercontent.com/<org>/inferno-autoscaler/<tag or branch>/dist/install.yaml
+```

-## Example 1: Unlimited accelerators
+### By providing a Helm Chart

-![unlimited-assign](docs/figs/Slide16.png)
+1. Build the chart using the optional helm plugin

-![unlimited-perf](docs/figs/Slide17.png)
+```sh
+kubebuilder edit --plugins=helm/v1-alpha
+```

-## Example 2: Load change - Unlimited accelerators
+2. See that a chart was generated under 'dist/chart', and users
+can obtain this solution from there.

-![unlimited-change-assign](docs/figs/Slide19.png)
+**NOTE:** If you change the project, you need to update the Helm Chart
+using the same command above to sync the latest changes. Furthermore,
+if you create webhooks, you need to use the above command with
+the '--force' flag and manually ensure that any custom configuration
+previously added to 'dist/chart/values.yaml' or 'dist/chart/manager/manager.yaml'
+is manually re-applied afterwards.

-![unlimited-change](docs/figs/Slide20.png)
+## Contributing

-![unlimited-change-perf](docs/figs/Slide21.png)
+Please join the llm-d autoscaling community meetings and feel free to submit GitHub issues and PRs.

-## Example 3: Limited accelerators
+**NOTE:** Run `make help` for more information on all potential `make` targets.

-![limited-count](docs/figs/Slide22.png)
+More information can be found via the [Kubebuilder Documentation](https://book.kubebuilder.io/introduction.html).

-![limited-assign](docs/figs/Slide23.png)
+## License

-![limited-perf](docs/figs/Slide24.png)
+Copyright 2025.

-## Example 4: Load change - Limited accelerators
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-![limited-change-assign](docs/figs/Slide26.png)
+    http://www.apache.org/licenses/LICENSE-2.0

-![limited-change](docs/figs/Slide27.png)
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

-![limited-change-perf](docs/figs/Slide28.png)
diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go
new file mode 100644
index 000000000..85e0657a1
--- /dev/null
+++ b/api/v1alpha1/groupversion_info.go
@@ -0,0 +1,36 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha1 contains API Schema definitions for the llmd v1alpha1 API group. +// +kubebuilder:object:generate=true +// +groupName=llmd.ai +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects. + GroupVersion = schema.GroupVersion{Group: "llmd.ai", Version: "v1alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme. + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/v1alpha1/variantautoscaling_types.go b/api/v1alpha1/variantautoscaling_types.go new file mode 100644 index 000000000..83d07b428 --- /dev/null +++ b/api/v1alpha1/variantautoscaling_types.go @@ -0,0 +1,128 @@ +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:shortName=opt +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=".spec.modelID" +// +kubebuilder:printcolumn:name="Current",type=string,JSONPath=".status.currentAlloc.accelerator" +// +kubebuilder:printcolumn:name="Desired",type=string,JSONPath=".status.desiredOptimizedAlloc.accelerator" +// +kubebuilder:printcolumn:name="Replicas",type=integer,JSONPath=".status.currentAlloc.numReplicas" +// +kubebuilder:printcolumn:name="Actuated",type=string,JSONPath=".status.actuation.applied" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type VariantAutoscalingSpec struct { + // +kubebuilder:validation:MinLength=1 + ModelID string `json:"modelID"` + + SLOClassRef ConfigMapKeyRef `json:"sloClassRef"` + ModelProfile ModelProfile `json:"modelProfile"` +} + +type ConfigMapKeyRef struct { + // +kubebuilder:validation:MinLength=1 + Name string `json:"name"` + + // +kubebuilder:validation:MinLength=1 + Key string `json:"key"` +} + +type ModelProfile struct { + // +kubebuilder:validation:MinItems=1 + Accelerators []AcceleratorProfile `json:"accelerators"` +} + +type AcceleratorProfile struct { + // +kubebuilder:validation:MinLength=1 + Acc string `json:"acc"` + + // +kubebuilder:validation:Minimum=1 + AccCount int `json:"accCount"` + + // +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$` + Alpha string `json:"alpha"` + + // +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$` + Beta string `json:"beta"` + + // +kubebuilder:validation:Minimum=1 + MaxBatchSize int `json:"maxBatchSize"` + + // +kubebuilder:validation:Minimum=1 + AtTokens int `json:"atTokens"` +} + +type VariantAutoscalingStatus struct { + CurrentAlloc Allocation `json:"currentAlloc,omitempty"` + DesiredOptimizedAlloc OptimizedAlloc `json:"desiredOptimizedAlloc,omitempty"` + Actuation ActuationStatus `json:"actuation,omitempty"` +} + +type Allocation struct { + // +kubebuilder:validation:MinLength=1 + Accelerator string `json:"accelerator"` + + // +kubebuilder:validation:Minimum=0 + NumReplicas int `json:"numReplicas"` + + // 
+kubebuilder:validation:Minimum=0 + MaxBatch int `json:"maxBatch"` + + // +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$` + VariantCost string `json:"variantCost"` + + // +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$` + ITLAverage string `json:"itlAverage"` + + // +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$` + WaitAverage string `json:"waitAverage"` + + Load LoadProfile `json:"load"` +} + +type LoadProfile struct { + ArrivalRate string `json:"arrivalRate"` + + AvgLength string `json:"avgLength"` +} + +type OptimizedAlloc struct { + LastRunTime metav1.Time `json:"lastRunTime,omitempty"` + + // +kubebuilder:validation:MinLength=2 + Accelerator string `json:"accelerator"` + + // +kubebuilder:validation:Minimum=0 + NumReplicas int `json:"numReplicas"` +} + +type ActuationStatus struct { + Applied bool `json:"applied"` + LastAttemptTime metav1.Time `json:"lastAttemptTime,omitempty"` + LastSuccessTime metav1.Time `json:"lastSuccessTime,omitempty"` +} + +// +kubebuilder:object:root=true + +type VariantAutoscaling struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec VariantAutoscalingSpec `json:"spec,omitempty"` + Status VariantAutoscalingStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +type VariantAutoscalingList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []VariantAutoscaling `json:"items"` +} + +func init() { + SchemeBuilder.Register(&VariantAutoscaling{}, &VariantAutoscalingList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 000000000..85dcaa8c4 --- /dev/null +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,233 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AcceleratorProfile) DeepCopyInto(out *AcceleratorProfile) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorProfile. +func (in *AcceleratorProfile) DeepCopy() *AcceleratorProfile { + if in == nil { + return nil + } + out := new(AcceleratorProfile) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ActuationStatus) DeepCopyInto(out *ActuationStatus) { + *out = *in + in.LastAttemptTime.DeepCopyInto(&out.LastAttemptTime) + in.LastSuccessTime.DeepCopyInto(&out.LastSuccessTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ActuationStatus. 
+func (in *ActuationStatus) DeepCopy() *ActuationStatus { + if in == nil { + return nil + } + out := new(ActuationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Allocation) DeepCopyInto(out *Allocation) { + *out = *in + out.Load = in.Load +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Allocation. +func (in *Allocation) DeepCopy() *Allocation { + if in == nil { + return nil + } + out := new(Allocation) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ConfigMapKeyRef) DeepCopyInto(out *ConfigMapKeyRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ConfigMapKeyRef. +func (in *ConfigMapKeyRef) DeepCopy() *ConfigMapKeyRef { + if in == nil { + return nil + } + out := new(ConfigMapKeyRef) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LoadProfile) DeepCopyInto(out *LoadProfile) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LoadProfile. +func (in *LoadProfile) DeepCopy() *LoadProfile { + if in == nil { + return nil + } + out := new(LoadProfile) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelProfile) DeepCopyInto(out *ModelProfile) { + *out = *in + if in.Accelerators != nil { + in, out := &in.Accelerators, &out.Accelerators + *out = make([]AcceleratorProfile, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelProfile. +func (in *ModelProfile) DeepCopy() *ModelProfile { + if in == nil { + return nil + } + out := new(ModelProfile) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OptimizedAlloc) DeepCopyInto(out *OptimizedAlloc) { + *out = *in + in.LastRunTime.DeepCopyInto(&out.LastRunTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OptimizedAlloc. +func (in *OptimizedAlloc) DeepCopy() *OptimizedAlloc { + if in == nil { + return nil + } + out := new(OptimizedAlloc) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VariantAutoscaling) DeepCopyInto(out *VariantAutoscaling) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VariantAutoscaling. +func (in *VariantAutoscaling) DeepCopy() *VariantAutoscaling { + if in == nil { + return nil + } + out := new(VariantAutoscaling) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *VariantAutoscaling) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VariantAutoscalingList) DeepCopyInto(out *VariantAutoscalingList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]VariantAutoscaling, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VariantAutoscalingList. +func (in *VariantAutoscalingList) DeepCopy() *VariantAutoscalingList { + if in == nil { + return nil + } + out := new(VariantAutoscalingList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *VariantAutoscalingList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VariantAutoscalingSpec) DeepCopyInto(out *VariantAutoscalingSpec) { + *out = *in + out.SLOClassRef = in.SLOClassRef + in.ModelProfile.DeepCopyInto(&out.ModelProfile) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VariantAutoscalingSpec. +func (in *VariantAutoscalingSpec) DeepCopy() *VariantAutoscalingSpec { + if in == nil { + return nil + } + out := new(VariantAutoscalingSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VariantAutoscalingStatus) DeepCopyInto(out *VariantAutoscalingStatus) { + *out = *in + out.CurrentAlloc = in.CurrentAlloc + in.DesiredOptimizedAlloc.DeepCopyInto(&out.DesiredOptimizedAlloc) + in.Actuation.DeepCopyInto(&out.Actuation) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VariantAutoscalingStatus. +func (in *VariantAutoscalingStatus) DeepCopy() *VariantAutoscalingStatus { + if in == nil { + return nil + } + out := new(VariantAutoscalingStatus) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 000000000..b017b8e61 --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,244 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "crypto/tls" + "flag" + "os" + "path/filepath" + + // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) + // to ensure that exec-entrypoint and run can make use of them. 
+ _ "k8s.io/client-go/plugin/pkg/client/auth" + + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/certwatcher" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" + + llmdv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1" + "github.com/llm-d-incubation/inferno-autoscaler/internal/controller" + // +kubebuilder:scaffold:imports +) + +var ( + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + + utilruntime.Must(llmdv1alpha1.AddToScheme(scheme)) + // +kubebuilder:scaffold:scheme +} + +// nolint:gocyclo +func main() { + var metricsAddr string + var metricsCertPath, metricsCertName, metricsCertKey string + var webhookCertPath, webhookCertName, webhookCertKey string + var enableLeaderElection bool + var probeAddr string + var secureMetrics bool + var enableHTTP2 bool + var tlsOpts []func(*tls.Config) + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.BoolVar(&enableLeaderElection, "leader-elect", false, + "Enable leader election for controller manager. "+ + "Enabling this will ensure there is only one active controller manager.") + flag.BoolVar(&secureMetrics, "metrics-secure", true, + "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.") + flag.StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.") + flag.StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") + flag.StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") + flag.StringVar(&metricsCertPath, "metrics-cert-path", "", + "The directory that contains the metrics server certificate.") + flag.StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.") + flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") + flag.BoolVar(&enableHTTP2, "enable-http2", false, + "If set, HTTP/2 will be enabled for the metrics and webhook servers") + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + // if the enable-http2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. 
For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + if !enableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + // Create watchers for metrics and webhooks certificates + var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher + + // Initial webhook TLS options + webhookTLSOpts := tlsOpts + + if len(webhookCertPath) > 0 { + setupLog.Info("Initializing webhook certificate watcher using provided certificates", + "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey) + + var err error + webhookCertWatcher, err = certwatcher.New( + filepath.Join(webhookCertPath, webhookCertName), + filepath.Join(webhookCertPath, webhookCertKey), + ) + if err != nil { + setupLog.Error(err, "Failed to initialize webhook certificate watcher") + os.Exit(1) + } + + webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { + config.GetCertificate = webhookCertWatcher.GetCertificate + }) + } + + webhookServer := webhook.NewServer(webhook.Options{ + TLSOpts: webhookTLSOpts, + }) + + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: metricsAddr, + SecureServing: secureMetrics, + TLSOpts: tlsOpts, + } + + if secureMetrics { + // FilterProvider is used to protect the metrics endpoint with authn/authz. + // These configurations ensure that only authorized users and service accounts + // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. More info: + // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/filters#WithAuthenticationAndAuthorization + metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization + } + + // If the certificate is not specified, controller-runtime will automatically + // generate self-signed certificates for the metrics server. While convenient for development and testing, + // this setup is not recommended for production. + // + // TODO(user): If you enable certManager, uncomment the following lines: + // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates + // managed by cert-manager for the metrics server. + // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification. 
+ if len(metricsCertPath) > 0 { + setupLog.Info("Initializing metrics certificate watcher using provided certificates", + "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey) + + var err error + metricsCertWatcher, err = certwatcher.New( + filepath.Join(metricsCertPath, metricsCertName), + filepath.Join(metricsCertPath, metricsCertKey), + ) + if err != nil { + setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) + os.Exit(1) + } + + metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) { + config.GetCertificate = metricsCertWatcher.GetCertificate + }) + } + + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + Scheme: scheme, + Metrics: metricsServerOptions, + WebhookServer: webhookServer, + HealthProbeBindAddress: probeAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "72dd1cf1.llm-d.ai", + // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily + // when the Manager ends. This requires the binary to immediately end when the + // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly + // speeds up voluntary leader transitions as the new leader don't have to wait + // LeaseDuration time first. + // + // In the default scaffold provided, the program ends immediately after + // the manager stops, so would be fine to enable this option. However, + // if you are doing or is intended to do any operation such as perform cleanups + // after the manager stops then its usage might be unsafe. + // LeaderElectionReleaseOnCancel: true, + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + + if err = (&controller.VariantAutoscalingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "variantautoscaling") + os.Exit(1) + } + // +kubebuilder:scaffold:builder + + if metricsCertWatcher != nil { + setupLog.Info("Adding metrics certificate watcher to manager") + if err := mgr.Add(metricsCertWatcher); err != nil { + setupLog.Error(err, "unable to add metrics certificate watcher to manager") + os.Exit(1) + } + } + + if webhookCertWatcher != nil { + setupLog.Info("Adding webhook certificate watcher to manager") + if err := mgr.Add(webhookCertWatcher); err != nil { + setupLog.Error(err, "unable to add webhook certificate watcher to manager") + os.Exit(1) + } + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + os.Exit(1) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + os.Exit(1) + } + + setupLog.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "problem running manager") + os.Exit(1) + } +} diff --git a/cmd/optimizer/main.go b/cmd/optimizer/main.go deleted file mode 100644 index 510079c40..000000000 --- a/cmd/optimizer/main.go +++ /dev/null @@ -1,20 +0,0 @@ -package main - -import ( - "os" - - rest "github.com/llm-inferno/inferno/rest-server" -) - -// create and run a REST API Optimizer server -// - stateless (default) or statefull (with -F argument) -func main() { - var server rest.RESTServer - statefull := len(os.Args) > 1 && os.Args[1] == rest.DefaultStatefull - if statefull { - server = rest.NewStateFullServer() - } else { - 
server = rest.NewStateLessServer() - } - server.Run() -} diff --git a/config/crd/bases/llmd.ai_variantautoscalings.yaml b/config/crd/bases/llmd.ai_variantautoscalings.yaml new file mode 100644 index 000000000..ae3c041b5 --- /dev/null +++ b/config/crd/bases/llmd.ai_variantautoscalings.yaml @@ -0,0 +1,168 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: variantautoscalings.llmd.ai +spec: + group: llmd.ai + names: + kind: VariantAutoscaling + listKind: VariantAutoscalingList + plural: variantautoscalings + singular: variantautoscaling + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + modelID: + minLength: 1 + type: string + modelProfile: + properties: + accelerators: + items: + properties: + acc: + minLength: 1 + type: string + accCount: + minimum: 1 + type: integer + alpha: + pattern: ^\d+(\.\d+)?$ + type: string + atTokens: + minimum: 1 + type: integer + beta: + pattern: ^\d+(\.\d+)?$ + type: string + maxBatchSize: + minimum: 1 + type: integer + required: + - acc + - accCount + - alpha + - atTokens + - beta + - maxBatchSize + type: object + minItems: 1 + type: array + required: + - accelerators + type: object + sloClassRef: + properties: + key: + minLength: 1 + type: string + name: + minLength: 1 + type: string + required: + - key + - name + type: object + required: + - modelID + - modelProfile + - sloClassRef + type: object + status: + properties: + actuation: + properties: + applied: + type: boolean + lastAttemptTime: + format: date-time + type: string + lastSuccessTime: + format: date-time + type: string + required: + - applied + type: object + currentAlloc: + properties: + accelerator: + minLength: 1 + type: string + itlAverage: + pattern: ^\d+(\.\d+)?$ + type: string + load: + properties: + arrivalRate: + type: string + avgLength: + type: string + required: + - arrivalRate + - avgLength + type: object + maxBatch: + minimum: 0 + type: integer + numReplicas: + minimum: 0 + type: integer + variantCost: + pattern: ^\d+(\.\d+)?$ + type: string + waitAverage: + pattern: ^\d+(\.\d+)?$ + type: string + required: + - accelerator + - itlAverage + - load + - maxBatch + - numReplicas + - variantCost + - waitAverage + type: object + desiredOptimizedAlloc: + properties: + accelerator: + minLength: 2 + type: string + lastRunTime: + format: date-time + type: string + numReplicas: + minimum: 0 + type: integer + required: + - accelerator + - numReplicas + type: object + type: object + type: object + served: true + storage: true diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml new file mode 100644 index 000000000..04ba6189f --- /dev/null +++ b/config/crd/kustomization.yaml @@ -0,0 +1,16 @@ +# This 
kustomization.yaml is not intended to be run by itself, +# since it depends on service name and namespace that are out of this kustomize package. +# It should be run by config/default +resources: +- bases/llmd.ai_variantautoscalings.yaml +# +kubebuilder:scaffold:crdkustomizeresource + +#patches: +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. +# patches here are for enabling the conversion webhook for each CRD +# +kubebuilder:scaffold:crdkustomizewebhookpatch + +# [WEBHOOK] To enable webhook, uncomment the following section +# the following config is for teaching kustomize how to do kustomization for CRDs. +#configurations: +#- kustomizeconfig.yaml diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml new file mode 100644 index 000000000..ec5c150a9 --- /dev/null +++ b/config/crd/kustomizeconfig.yaml @@ -0,0 +1,19 @@ +# This file is for teaching kustomize how to substitute name and namespace reference in CRD +nameReference: +- kind: Service + version: v1 + fieldSpecs: + - kind: CustomResourceDefinition + version: v1 + group: apiextensions.k8s.io + path: spec/conversion/webhook/clientConfig/service/name + +namespace: +- kind: CustomResourceDefinition + version: v1 + group: apiextensions.k8s.io + path: spec/conversion/webhook/clientConfig/service/namespace + create: false + +varReference: +- path: metadata/annotations diff --git a/config/default/cert_metrics_manager_patch.yaml b/config/default/cert_metrics_manager_patch.yaml new file mode 100644 index 000000000..d97501553 --- /dev/null +++ b/config/default/cert_metrics_manager_patch.yaml @@ -0,0 +1,30 @@ +# This patch adds the args, volumes, and ports to allow the manager to use the metrics-server certs. + +# Add the volumeMount for the metrics-server certs +- op: add + path: /spec/template/spec/containers/0/volumeMounts/- + value: + mountPath: /tmp/k8s-metrics-server/metrics-certs + name: metrics-certs + readOnly: true + +# Add the --metrics-cert-path argument for the metrics server +- op: add + path: /spec/template/spec/containers/0/args/- + value: --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs + +# Add the metrics-server certs volume configuration +- op: add + path: /spec/template/spec/volumes/- + value: + name: metrics-certs + secret: + secretName: metrics-server-cert + optional: false + items: + - key: ca.crt + path: ca.crt + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml new file mode 100644 index 000000000..931d48530 --- /dev/null +++ b/config/default/kustomization.yaml @@ -0,0 +1,234 @@ +# Adds namespace to all resources. +namespace: inferno-autoscaler-system + +# Value of this field is prepended to the +# names of all resources, e.g. a deployment named +# "wordpress" becomes "alices-wordpress". +# Note that it should also match with the prefix (text before '-') of the namespace +# field above. +namePrefix: inferno-autoscaler- + +# Labels to add to all resources and selectors. +#labels: +#- includeSelectors: true +# pairs: +# someName: someValue + +resources: +- ../crd +- ../rbac +- ../manager +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in +# crd/kustomization.yaml +#- ../webhook +# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. +#- ../certmanager +# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 
+#- ../prometheus +# [METRICS] Expose the controller manager metrics service. +- metrics_service.yaml +# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. +# Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. +# Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will +# be able to communicate with the Webhook Server. +#- ../network-policy + +# Uncomment the patches line if you enable Metrics +patches: +# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. +# More info: https://book.kubebuilder.io/reference/metrics +- path: manager_metrics_patch.yaml + target: + kind: Deployment + +# Uncomment the patches line if you enable Metrics and CertManager +# [METRICS-WITH-CERTS] To enable metrics protected with certManager, uncomment the following line. +# This patch will protect the metrics with certManager self-signed certs. +#- path: cert_metrics_manager_patch.yaml +# target: +# kind: Deployment + +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in +# crd/kustomization.yaml +#- path: manager_webhook_patch.yaml +# target: +# kind: Deployment + +# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. +# Uncomment the following replacements to add the cert-manager CA injection annotations +#replacements: +# - source: # Uncomment the following block to enable certificates for metrics +# kind: Service +# version: v1 +# name: controller-manager-metrics-service +# fieldPath: metadata.name +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: metrics-certs +# fieldPaths: +# - spec.dnsNames.0 +# - spec.dnsNames.1 +# options: +# delimiter: '.' +# index: 0 +# create: true +# - select: # Uncomment the following to set the Service name for TLS config in Prometheus ServiceMonitor +# kind: ServiceMonitor +# group: monitoring.coreos.com +# version: v1 +# name: controller-manager-metrics-monitor +# fieldPaths: +# - spec.endpoints.0.tlsConfig.serverName +# options: +# delimiter: '.' +# index: 0 +# create: true +# +# - source: +# kind: Service +# version: v1 +# name: controller-manager-metrics-service +# fieldPath: metadata.namespace +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: metrics-certs +# fieldPaths: +# - spec.dnsNames.0 +# - spec.dnsNames.1 +# options: +# delimiter: '.' +# index: 1 +# create: true +# - select: # Uncomment the following to set the Service namespace for TLS in Prometheus ServiceMonitor +# kind: ServiceMonitor +# group: monitoring.coreos.com +# version: v1 +# name: controller-manager-metrics-monitor +# fieldPaths: +# - spec.endpoints.0.tlsConfig.serverName +# options: +# delimiter: '.' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have any webhook +# kind: Service +# version: v1 +# name: webhook-service +# fieldPath: .metadata.name # Name of the service +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPaths: +# - .spec.dnsNames.0 +# - .spec.dnsNames.1 +# options: +# delimiter: '.' 
+# index: 0 +# create: true +# - source: +# kind: Service +# version: v1 +# name: webhook-service +# fieldPath: .metadata.namespace # Namespace of the service +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPaths: +# - .spec.dnsNames.0 +# - .spec.dnsNames.1 +# options: +# delimiter: '.' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: ValidatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.name +# targets: +# - select: +# kind: ValidatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: MutatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.name +# targets: +# - select: +# kind: MutatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. +# +kubebuilder:scaffold:crdkustomizecainjectionns +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.name +# targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 
+# +kubebuilder:scaffold:crdkustomizecainjectionname diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml new file mode 100644 index 000000000..2aaef6536 --- /dev/null +++ b/config/default/manager_metrics_patch.yaml @@ -0,0 +1,4 @@ +# This patch adds the args to allow exposing the metrics endpoint using HTTPS +- op: add + path: /spec/template/spec/containers/0/args/0 + value: --metrics-bind-address=:8443 diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml new file mode 100644 index 000000000..217ec92c0 --- /dev/null +++ b/config/default/metrics_service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: inferno-autoscaler + app.kubernetes.io/managed-by: kustomize + name: controller-manager-metrics-service + namespace: system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + control-plane: controller-manager + app.kubernetes.io/name: inferno-autoscaler diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml new file mode 100644 index 000000000..02596d175 --- /dev/null +++ b/config/manager/kustomization.yaml @@ -0,0 +1,8 @@ +resources: +- manager.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +images: +- name: controller + newName: quay.io/amalvank/inferno + newTag: latest diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml new file mode 100644 index 000000000..4ca0f7e13 --- /dev/null +++ b/config/manager/manager.yaml @@ -0,0 +1,98 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: inferno-autoscaler + app.kubernetes.io/managed-by: kustomize + name: system +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system + labels: + control-plane: controller-manager + app.kubernetes.io/name: inferno-autoscaler + app.kubernetes.io/managed-by: kustomize +spec: + selector: + matchLabels: + control-plane: controller-manager + app.kubernetes.io/name: inferno-autoscaler + replicas: 1 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + app.kubernetes.io/name: inferno-autoscaler + spec: + # TODO(user): Uncomment the following code to configure the nodeAffinity expression + # according to the platforms which are supported by your solution. + # It is considered best practice to support multiple architectures. You can + # build your manager image using the makefile target docker-buildx. + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/arch + # operator: In + # values: + # - amd64 + # - arm64 + # - ppc64le + # - s390x + # - key: kubernetes.io/os + # operator: In + # values: + # - linux + securityContext: + # Projects are configured by default to adhere to the "restricted" Pod Security Standards. + # This ensures that deployments meet the highest security requirements for Kubernetes. 
+        # For more details, see: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
+        runAsNonRoot: true
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+      - command:
+        - /manager
+        args:
+        - --leader-elect
+        - --health-probe-bind-address=:8081
+        image: controller:latest
+        name: manager
+        ports: []
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - "ALL"
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        # TODO(user): Configure the resources accordingly based on the project requirements.
+        # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+        resources:
+          limits:
+            cpu: 500m
+            memory: 128Mi
+          requests:
+            cpu: 10m
+            memory: 64Mi
+        volumeMounts: []
+      volumes: []
+      serviceAccountName: controller-manager
+      terminationGracePeriodSeconds: 10
diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml
new file mode 100644
index 000000000..688cc37e0
--- /dev/null
+++ b/config/network-policy/allow-metrics-traffic.yaml
@@ -0,0 +1,27 @@
+# This NetworkPolicy allows ingress traffic from Pods running in namespaces
+# labeled with 'metrics: enabled'. Only Pods in those namespaces are able to
+# gather data from the metrics endpoint.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: allow-metrics-traffic
+  namespace: system
+spec:
+  podSelector:
+    matchLabels:
+      control-plane: controller-manager
+      app.kubernetes.io/name: inferno-autoscaler
+  policyTypes:
+    - Ingress
+  ingress:
+    # This allows ingress traffic from any namespace with the label metrics: enabled
+    - from:
+      - namespaceSelector:
+          matchLabels:
+            metrics: enabled # Only from namespaces with this label
+      ports:
+        - port: 8443
+          protocol: TCP
diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml
new file mode 100644
index 000000000..ec0fb5e57
--- /dev/null
+++ b/config/network-policy/kustomization.yaml
@@ -0,0 +1,2 @@
+resources:
+- allow-metrics-traffic.yaml
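When the optional NetworkPolicy above is enabled, only namespaces labeled metrics: enabled can reach the metrics port, so the namespace running your scraper has to be labeled first. A minimal sketch (the namespace name "monitoring" is only an example):

    # Admit an existing monitoring namespace through the allow-metrics-traffic policy
    kubectl label namespace monitoring metrics=enabled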
diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml
new file mode 100644
index 000000000..fdc5481b1
--- /dev/null
+++ b/config/prometheus/kustomization.yaml
@@ -0,0 +1,11 @@
+resources:
+- monitor.yaml
+
+# [PROMETHEUS-WITH-CERTS] The following patch configures the ServiceMonitor in ../prometheus
+# to securely reference certificates created and managed by cert-manager.
+# Additionally, ensure that you uncomment the [METRICS-WITH-CERTS] patch under config/default/kustomization.yaml
+# to mount the "metrics-server-cert" secret in the Manager Deployment.
+#patches:
+#  - path: monitor_tls_patch.yaml
+#    target:
+#      kind: ServiceMonitor
diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml
new file mode 100644
index 000000000..5eb5f4c02
--- /dev/null
+++ b/config/prometheus/monitor.yaml
@@ -0,0 +1,27 @@
+# Prometheus Monitor Service (Metrics)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    control-plane: controller-manager
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: controller-manager-metrics-monitor
+  namespace: system
+spec:
+  endpoints:
+    - path: /metrics
+      port: https # Ensure this is the name of the port that exposes HTTPS metrics
+      scheme: https
+      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+      tlsConfig:
+        # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables
+        # certificate verification, exposing the system to potential man-in-the-middle attacks.
+        # For production environments, it is recommended to use cert-manager for automatic TLS certificate management.
+        # To apply this configuration, enable cert-manager and use the patch located at config/prometheus/monitor_tls_patch.yaml,
+        # which securely references the certificate from the 'metrics-server-cert' secret.
+        insecureSkipVerify: true
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+      app.kubernetes.io/name: inferno-autoscaler
diff --git a/config/prometheus/monitor_tls_patch.yaml b/config/prometheus/monitor_tls_patch.yaml
new file mode 100644
index 000000000..5bf84ce0d
--- /dev/null
+++ b/config/prometheus/monitor_tls_patch.yaml
@@ -0,0 +1,19 @@
+# Patch for Prometheus ServiceMonitor to enable secure TLS configuration
+# using certificates managed by cert-manager
+- op: replace
+  path: /spec/endpoints/0/tlsConfig
+  value:
+    # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize
+    serverName: SERVICE_NAME.SERVICE_NAMESPACE.svc
+    insecureSkipVerify: false
+    ca:
+      secret:
+        name: metrics-server-cert
+        key: ca.crt
+    cert:
+      secret:
+        name: metrics-server-cert
+        key: tls.crt
+    keySecret:
+      name: metrics-server-cert
+      key: tls.key
diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml
new file mode 100644
index 000000000..2b91bfe9c
--- /dev/null
+++ b/config/rbac/kustomization.yaml
@@ -0,0 +1,28 @@
+resources:
+# All RBAC will be applied under this service account in
+# the deployment namespace. You may comment out this resource
+# if your manager will use a service account that exists at
+# runtime. Be sure to update RoleBinding and ClusterRoleBinding
+# subjects if changing service account names.
+- service_account.yaml
+- role.yaml
+- role_binding.yaml
+- leader_election_role.yaml
+- leader_election_role_binding.yaml
+# The following RBAC configurations are used to protect
+# the metrics endpoint with authn/authz. These configurations
+# ensure that only authorized users and service accounts
+# can access the metrics endpoint. Comment the following
+# permissions if you want to disable this protection.
+# More info: https://book.kubebuilder.io/reference/metrics.html
+- metrics_auth_role.yaml
+- metrics_auth_role_binding.yaml
+- metrics_reader_role.yaml
+# For each CRD, "Admin", "Editor" and "Viewer" roles are scaffolded by
+# default, aiding admins in cluster management. Those roles are
+# not used by inferno-autoscaler itself. You can comment the following lines
+# if you do not want those helpers to be installed with your Project.
+- variantautoscaling_admin_role.yaml
+- variantautoscaling_editor_role.yaml
+- variantautoscaling_viewer_role.yaml
+
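Note that config/default prepends the inferno-autoscaler- namePrefix to everything listed above, so the names that land on the cluster differ from the ones in these files. One way to preview what will actually be applied, assuming kubectl's built-in kustomize and a checkout of this repo:

    # Render the default overlay and skim the prefixed object names
    kubectl kustomize config/default | grep '^  name: inferno-autoscaler-'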
diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml
new file mode 100644
index 000000000..567dfb985
--- /dev/null
+++ b/config/rbac/leader_election_role.yaml
@@ -0,0 +1,40 @@
+# permissions to do leader election.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: leader-election-role
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - configmaps
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+- apiGroups:
+  - coordination.k8s.io
+  resources:
+  - leases
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+- apiGroups:
+  - ""
+  resources:
+  - events
+  verbs:
+  - create
+  - patch
diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml
new file mode 100644
index 000000000..cda7c0877
--- /dev/null
+++ b/config/rbac/leader_election_role_binding.yaml
@@ -0,0 +1,15 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: leader-election-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: leader-election-role
+subjects:
+- kind: ServiceAccount
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml
new file mode 100644
index 000000000..32d2e4ec6
--- /dev/null
+++ b/config/rbac/metrics_auth_role.yaml
@@ -0,0 +1,17 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metrics-auth-role
+rules:
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml
new file mode 100644
index 000000000..e775d67ff
--- /dev/null
+++ b/config/rbac/metrics_auth_role_binding.yaml
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: metrics-auth-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: metrics-auth-role
+subjects:
+- kind: ServiceAccount
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml
new file mode 100644
index 000000000..51a75db47
--- /dev/null
+++ b/config/rbac/metrics_reader_role.yaml
@@ -0,0 +1,9 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metrics-reader
+rules:
+- nonResourceURLs:
+  - "/metrics"
+  verbs:
+  - get
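The metrics_auth_* and metrics-reader roles above exist because the manager serves /metrics behind token authn/authz. A rough smoke test of the protected endpoint, assuming the inferno-autoscaler- prefix and namespace from config/default (the binding name, pod name, and choice of the default ServiceAccount are placeholders):

    # Allow a ServiceAccount to read metrics, then scrape once with its token
    kubectl create clusterrolebinding metrics-smoke-test \
      --clusterrole=inferno-autoscaler-metrics-reader \
      --serviceaccount=inferno-autoscaler-system:default
    TOKEN=$(kubectl create token default -n inferno-autoscaler-system)
    kubectl run metrics-smoke-test --rm -it --restart=Never --image=curlimages/curl \
      -n inferno-autoscaler-system -- curl -k -H "Authorization: Bearer ${TOKEN}" \
      https://inferno-autoscaler-controller-manager-metrics-service:8443/metrics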
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
new file mode 100644
index 000000000..24ccf78e4
--- /dev/null
+++ b/config/rbac/role.yaml
@@ -0,0 +1,62 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: manager-role
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - configmaps
+  verbs:
+  - get
+  - list
+  - update
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - nodes/status
+  verbs:
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - apps
+  resources:
+  - deployments
+  verbs:
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings/finalizers
+  verbs:
+  - update
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings/status
+  verbs:
+  - get
+  - patch
+  - update
diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml
new file mode 100644
index 000000000..81f9f9dd2
--- /dev/null
+++ b/config/rbac/role_binding.yaml
@@ -0,0 +1,15 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: manager-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: manager-role
+subjects:
+- kind: ServiceAccount
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/service_account.yaml b/config/rbac/service_account.yaml
new file mode 100644
index 000000000..2a2d1fd94
--- /dev/null
+++ b/config/rbac/service_account.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/variantautoscaling_admin_role.yaml b/config/rbac/variantautoscaling_admin_role.yaml
new file mode 100644
index 000000000..a9915a422
--- /dev/null
+++ b/config/rbac/variantautoscaling_admin_role.yaml
@@ -0,0 +1,27 @@
+# This rule is not used by the project inferno-autoscaler itself.
+# It is provided to allow the cluster admin to help manage permissions for users.
+#
+# Grants full permissions ('*') over llmd.ai.
+# This role is intended for users authorized to modify roles and bindings within the cluster,
+# enabling them to delegate specific permissions to other users or groups as needed.
+
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: variantautoscaling-admin-role
+rules:
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings
+  verbs:
+  - '*'
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings/status
+  verbs:
+  - get
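The admin role above is a delegation helper, not something the controller itself uses. Granting it to a person could look like the following (the user name is a placeholder; the inferno-autoscaler- prefix again comes from config/default):

    # Let one named user manage VariantAutoscaling resources cluster-wide
    kubectl create clusterrolebinding variantautoscaling-admin-jane \
      --clusterrole=inferno-autoscaler-variantautoscaling-admin-role \
      --user=jane@example.com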
diff --git a/config/rbac/variantautoscaling_editor_role.yaml b/config/rbac/variantautoscaling_editor_role.yaml
new file mode 100644
index 000000000..aaa3be846
--- /dev/null
+++ b/config/rbac/variantautoscaling_editor_role.yaml
@@ -0,0 +1,33 @@
+# This rule is not used by the project inferno-autoscaler itself.
+# It is provided to allow the cluster admin to help manage permissions for users.
+#
+# Grants permissions to create, update, and delete resources within the llmd.ai API group.
+# This role is intended for users who need to manage these resources
+# but should not control RBAC or manage permissions for others.
+
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: variantautoscaling-editor-role
+rules:
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings/status
+  verbs:
+  - get
diff --git a/config/rbac/variantautoscaling_viewer_role.yaml b/config/rbac/variantautoscaling_viewer_role.yaml
new file mode 100644
index 000000000..36fedb89b
--- /dev/null
+++ b/config/rbac/variantautoscaling_viewer_role.yaml
@@ -0,0 +1,29 @@
+# This rule is not used by the project inferno-autoscaler itself.
+# It is provided to allow the cluster admin to help manage permissions for users.
+#
+# Grants read-only access to llmd.ai resources.
+# This role is intended for users who need visibility into these resources
+# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing.
+
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: inferno-autoscaler
+    app.kubernetes.io/managed-by: kustomize
+  name: variantautoscaling-viewer-role
+rules:
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - llmd.ai
+  resources:
+  - variantautoscalings/status
+  verbs:
+  - get
diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml
new file mode 100644
index 000000000..79d0b2ae7
--- /dev/null
+++ b/config/samples/kustomization.yaml
@@ -0,0 +1,4 @@
+## Append samples of your project ##
+resources:
+- llmd_v1alpha1_VariantAutoscalings.yaml
+# +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/demos/generators/main.go b/demos/generators/main.go deleted file mode 100644 index 7dfb19378..000000000 --- a/demos/generators/main.go +++ /dev/null @@ -1,153 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" -) - -func main() { - // accelerator names - aNames := []string{"AIU2", "L4", "L40S", "MI210", "A100", "G2", "MI250", "H100", - "MI300X", "2xAIU2", "2xMI210", "2xA100", "2xG2", "2xMI250", "2xH100", "2xMI300X", - "4xAIU2", "4xMI210", "4xA100", "4xG2", "4xMI250"} - - // model names - mNames := []string{"granite_13b", "granite_20b", "granite_34b", "llama_7b", "llama_13b", - "llama3_8b", "llama_70b", "mistral_7b", "mixtral_8_7b"} - - // model specs - // 1D: models - // 2D: accelerators x models - - // memSize := []int{39, 60, 102, 21, 39, 72, 210, 21, 168} - - alpha := [][]float32{ - {205.80, 297.80, 487.70, 116.90, 201.30, 129.30, 274.10, 123.20, 296.30}, - {137.20, 198.53, 325.13, 77.93, 134.20, 86.20, 182.73, 82.13, 197.53}, - {47.86, 69.26, 113.42, 27.19, 46.81, 30.07, 63.74, 28.65, 68.91}, - {25.10, 36.32, 59.48, 14.26, 24.55, 15.77, 33.43, 15.02, 36.13}, - {20.58, 29.78, 48.77, 11.69, 20.13, 12.93, 27.41, 12.32, 29.63}, - {17.15, 24.82, 40.64, 9.74, 16.78, 10.78, 22.84, 10.27, 24.69}, - {12.86, 18.61, 30.48, 7.31, 12.58, 8.08, 17.13, 7.70, 18.52}, - {12.25, 17.73, 29.03, 6.96, 11.98, 7.70, 16.32, 7.33, 17.64}, - {7.77, 11.24, 18.40, 4.41, 7.60, 4.88, 10.34, 4.65, 11.18}, - {147.00, 212.71, 348.36, 83.50, 143.79, 92.36, 195.79, 88.00, 211.64}, - {17.74, 25.67, 42.04, 10.08, 17.35, 11.15, 23.63, 10.62, 25.54}, - {14.60, 21.12, 34.59, 8.29, 14.28, 9.17, 19.44, 8.74, 21.01}, - {12.11, 17.52, 28.69,
6.88, 11.84, 7.61, 16.12, 7.25, 17.43}, - {9.11, 13.18, 21.58, 5.17, 8.91, 5.72, 12.13, 5.45, 13.11}, - {8.65, 12.51, 20.49, 4.91, 8.46, 5.43, 11.52, 5.18, 12.45}, - {5.49, 7.94, 13.01, 3.12, 5.37, 3.45, 7.31, 3.29, 7.90}, - {102.90, 148.90, 243.85, 58.45, 100.65, 64.65, 137.05, 61.60, 148.15}, - {12.55, 18.16, 29.74, 7.13, 12.27, 7.88, 16.71, 7.51, 18.07}, - {10.29, 14.89, 24.39, 5.85, 10.07, 6.47, 13.71, 6.16, 14.82}, - {8.58, 12.41, 20.32, 4.87, 8.39, 5.39, 11.42, 5.13, 12.35}, - {6.43, 9.31, 15.24, 3.65, 6.29, 4.04, 8.57, 3.85, 9.26}, - } - - beta := [][]float32{ - {4.10, 5.00, 8.20, 6.30, 8.70, 5.90, 70.70, 4.10, 60.10}, - {2.73, 3.33, 5.47, 4.20, 5.80, 3.93, 47.13, 2.73, 40.07}, - {0.95, 1.16, 1.91, 1.47, 2.02, 1.37, 16.44, 0.95, 13.98}, - {0.50, 0.61, 1.00, 0.77, 1.06, 0.72, 8.62, 0.50, 7.33}, - {0.41, 0.50, 0.82, 0.63, 0.87, 0.59, 7.07, 0.41, 6.01}, - {0.34, 0.42, 0.68, 0.53, 0.73, 0.49, 5.89, 0.34, 5.01}, - {0.26, 0.31, 0.51, 0.39, 0.54, 0.37, 4.42, 0.26, 3.76}, - {0.24, 0.30, 0.49, 0.38, 0.52, 0.35, 4.21, 0.24, 3.58}, - {0.15, 0.19, 0.31, 0.24, 0.33, 0.22, 2.67, 0.15, 2.27}, - {2.93, 3.57, 5.86, 4.50, 6.21, 4.21, 50.50, 2.93, 42.93}, - {0.35, 0.43, 0.71, 0.54, 0.75, 0.51, 6.09, 0.35, 5.18}, - {0.29, 0.35, 0.58, 0.45, 0.62, 0.42, 5.01, 0.29, 4.26}, - {0.24, 0.29, 0.48, 0.37, 0.51, 0.35, 4.16, 0.24, 3.54}, - {0.18, 0.22, 0.36, 0.28, 0.38, 0.26, 3.13, 0.18, 2.66}, - {0.17, 0.21, 0.34, 0.26, 0.37, 0.25, 2.97, 0.17, 2.53}, - {0.11, 0.13, 0.22, 0.17, 0.23, 0.16, 1.89, 0.11, 1.60}, - {2.05, 2.50, 4.10, 3.15, 4.35, 2.95, 35.35, 2.05, 30.05}, - {0.25, 0.30, 0.50, 0.38, 0.53, 0.36, 4.31, 0.25, 3.66}, - {0.21, 0.25, 0.41, 0.32, 0.44, 0.30, 3.54, 0.21, 3.01}, - {0.17, 0.21, 0.34, 0.26, 0.36, 0.25, 2.95, 0.17, 2.50}, - {0.13, 0.16, 0.26, 0.20, 0.27, 0.18, 2.21, 0.13, 1.88}, - } - - maxBatchSize := [][]int{ - {51, 38, 19, 102, 51, 25, 8, 102, 12}, - {9, 7, 3, 19, 9, 4, 1, 19, 2}, - {19, 14, 7, 38, 19, 9, 3, 38, 4}, - {25, 19, 9, 51, 25, 12, 4, 51, 6}, - {32, 24, 12, 64, 32, 16, 5, 64, 8}, - {38, 28, 14, 76, 38, 19, 6, 76, 9}, - {51, 38, 19, 102, 51, 25, 8, 102, 12}, - {32, 24, 12, 64, 32, 16, 5, 64, 8}, - {76, 57, 28, 153, 76, 38, 12, 153, 19}, - {102, 76, 38, 204, 102, 51, 16, 204, 25}, - {51, 38, 19, 102, 51, 25, 8, 102, 12}, - {64, 48, 24, 128, 64, 32, 10, 128, 16}, - {76, 57, 28, 153, 76, 38, 12, 153, 19}, - {102, 76, 38, 204, 102, 51, 16, 204, 25}, - {64, 48, 24, 128, 64, 32, 10, 128, 16}, - {153, 115, 57, 307, 153, 76, 24, 307, 38}, - {204, 153, 76, 409, 204, 102, 32, 409, 51}, - {102, 76, 38, 204, 102, 51, 16, 204, 25}, - {128, 96, 48, 256, 128, 64, 20, 256, 32}, - {153, 115, 57, 307, 153, 76, 24, 307, 38}, - {204, 153, 76, 409, 204, 102, 32, 409, 51}, - } - - count := [][]int{ - {1, 1, 1, 1, 1, 1, 2, 1, 2}, - {2, 4, 4, 1, 2, 4, 8, 1, 8}, - {1, 2, 4, 1, 1, 2, 4, 1, 4}, - {1, 1, 2, 1, 1, 2, 4, 1, 4}, - {1, 1, 2, 1, 1, 1, 4, 1, 4}, - {1, 1, 2, 1, 1, 1, 4, 1, 2}, - {1, 1, 1, 1, 1, 1, 2, 1, 2}, - {1, 1, 2, 1, 1, 1, 4, 1, 4}, - {1, 1, 1, 1, 1, 1, 2, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 2, 1, 2}, - {1, 1, 1, 1, 1, 1, 2, 1, 2}, - {1, 1, 1, 1, 1, 1, 2, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 2, 1, 2}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - } - - atTokens := 512 - - // create data structures - numAcc := len(aNames) - numModels := len(mNames) - models := config.ModelData{ - PerfData: 
make([]config.ModelAcceleratorPerfData, numModels*numAcc), - } - k := 0 - for i, n := range mNames { - for j, a := range aNames { - pd := config.ModelAcceleratorPerfData{ - Name: n, - Acc: a, - AccCount: count[j][i], - Alpha: alpha[j][i], - Beta: beta[j][i], - MaxBatchSize: maxBatchSize[j][i], - AtTokens: atTokens, - } - models.PerfData[k] = pd - k++ - } - } - - // generate json - if byteValue, err := json.Marshal(models); err != nil { - fmt.Println(err.Error()) - } else { - fmt.Println(string(byteValue)) - } -} diff --git a/demos/main/main.go b/demos/main/main.go deleted file mode 100644 index 9a4897482..000000000 --- a/demos/main/main.go +++ /dev/null @@ -1,116 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "os" - - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/inferno/pkg/core" - "github.com/llm-inferno/inferno/pkg/manager" - "github.com/llm-inferno/inferno/pkg/solver" - "github.com/llm-inferno/inferno/pkg/utils" -) - -func main() { - size := "large" - if len(os.Args) > 1 { - size = os.Args[1] - } - prefix := "../../sample-data/" + size + "/" - fn_acc := prefix + "accelerator-data.json" - fn_cap := prefix + "capacity-data.json" - fn_mod := prefix + "model-data.json" - fn_svc := prefix + "serviceclass-data.json" - fn_srv := prefix + "server-data.json" - fn_opt := prefix + "optimizer-data.json" - fn_sol := prefix + "solution-data.json" - - system := core.NewSystem() - - bytes_acc, err_acc := os.ReadFile(fn_acc) - if err_acc != nil { - fmt.Println(err_acc) - } - if d, err := utils.FromDataToSpec(bytes_acc, config.AcceleratorData{}); err == nil { - system.SetAcceleratorsFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_cap, err_cap := os.ReadFile(fn_cap) - if err_cap != nil { - fmt.Println(err_cap) - } - if d, err := utils.FromDataToSpec(bytes_cap, config.CapacityData{}); err == nil { - system.SetCapacityFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_mod, err_mod := os.ReadFile(fn_mod) - if err_mod != nil { - fmt.Println(err_mod) - } - if d, err := utils.FromDataToSpec(bytes_mod, config.ModelData{}); err == nil { - system.SetModelsFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_svc, err_svc := os.ReadFile(fn_svc) - if err_svc != nil { - fmt.Println(err_svc) - } - if d, err := utils.FromDataToSpec(bytes_svc, config.ServiceClassData{}); err == nil { - system.SetServiceClassesFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_srv, err_srv := os.ReadFile(fn_srv) - if err_srv != nil { - fmt.Println(err_srv) - } - if d, err := utils.FromDataToSpec(bytes_srv, config.ServerData{}); err == nil { - system.SetServersFromSpec(d) - } else { - fmt.Println(err) - return - } - - var optimizer *solver.Optimizer - bytes_opt, err_opt := os.ReadFile(fn_opt) - if err_opt != nil { - fmt.Println(err_acc) - } - if d, err := utils.FromDataToSpec(bytes_opt, config.OptimizerData{}); err == nil { - optimizer = solver.NewOptimizerFromSpec(&d.Spec) - } else { - fmt.Println(err) - return - } - - manager := manager.NewManager(system, optimizer) - - system.Calculate() - if err := manager.Optimize(); err != nil { - fmt.Println(err) - return - } - allocationSolution := system.GenerateSolution() - - // generate json - if byteValue, err := json.Marshal(allocationSolution); err != nil { - fmt.Println(err) - } else { - os.WriteFile(fn_sol, byteValue, 0644) - } - - fmt.Printf("%v", system) - fmt.Printf("%v", optimizer) -} diff --git a/demos/scale/main.go b/demos/scale/main.go deleted file mode 100644 index 39c7351a8..000000000 
--- a/demos/scale/main.go +++ /dev/null @@ -1,143 +0,0 @@ -package main - -import ( - "fmt" - "os" - - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/inferno/pkg/core" - "github.com/llm-inferno/inferno/pkg/manager" - "github.com/llm-inferno/inferno/pkg/solver" - "github.com/llm-inferno/inferno/pkg/utils" -) - -func main() { - size := "large" - if len(os.Args) > 1 { - size = os.Args[1] - } - prefix := "../../sample-data/" + size + "/" - fn_acc := prefix + "accelerator-data.json" - fn_cap := prefix + "capacity-data.json" - fn_mod := prefix + "model-data.json" - fn_svc := prefix + "serviceclass-data.json" - fn_srv := prefix + "server-data.json" - fn_opt := prefix + "optimizer-data.json" - - system := core.NewSystem() - - bytes_acc, err_acc := os.ReadFile(fn_acc) - if err_acc != nil { - fmt.Println(err_acc) - } - if d, err := utils.FromDataToSpec(bytes_acc, config.AcceleratorData{}); err == nil { - system.SetAcceleratorsFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_cap, err_cap := os.ReadFile(fn_cap) - if err_cap != nil { - fmt.Println(err_cap) - } - if d, err := utils.FromDataToSpec(bytes_cap, config.CapacityData{}); err == nil { - system.SetCapacityFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_mod, err_mod := os.ReadFile(fn_mod) - if err_mod != nil { - fmt.Println(err_mod) - } - if d, err := utils.FromDataToSpec(bytes_mod, config.ModelData{}); err == nil { - system.SetModelsFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_svc, err_svc := os.ReadFile(fn_svc) - if err_svc != nil { - fmt.Println(err_svc) - } - if d, err := utils.FromDataToSpec(bytes_svc, config.ServiceClassData{}); err == nil { - system.SetServiceClassesFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_srv, err_srv := os.ReadFile(fn_srv) - if err_srv != nil { - fmt.Println(err_srv) - } - if d, err := utils.FromDataToSpec(bytes_srv, config.ServerData{}); err == nil { - system.SetServersFromSpec(d) - } else { - fmt.Println(err) - return - } - - var optimizer *solver.Optimizer - bytes_opt, err_opt := os.ReadFile(fn_opt) - if err_opt != nil { - fmt.Println(err_acc) - } - if d, err := utils.FromDataToSpec(bytes_opt, config.OptimizerData{}); err == nil { - optimizer = solver.NewOptimizerFromSpec(&d.Spec) - } else { - fmt.Println(err) - return - } - - manager := manager.NewManager(system, optimizer) - - system.Calculate() - if err := manager.Optimize(); err != nil { - fmt.Println(err) - return - } - - serverName := "Premium-llama3_8b" - - server := system.Server(serverName) - if server == nil { - fmt.Printf("No server %s\n", serverName) - return - } - allocBefore := server.Allocation() - if allocBefore == nil { - fmt.Printf("No allocation for server %s \n", serverName) - return - } - // change load on server - load := server.Load() - if load == nil { - fmt.Printf("No model load data for server %s \n", serverName) - return - } - fmt.Println("AllocBefore: ", allocBefore) - newArv := load.ArrivalRate * 2.5 - newLength := int(float32(load.AvgLength) * 1.5) - newLoad := config.ServerLoadSpec{ - ArrivalRate: newArv, - AvgLength: newLength, - ArrivalCOV: load.ArrivalCOV, - ServiceCOV: load.ServiceCOV, - } - server.SetLoad(&newLoad) - - // scale allocation - allocAfter, inc := allocBefore.Scale(serverName) - fmt.Println("AllocAfter: ", allocAfter) - fmt.Println("Inc: ", inc) - - // reallocate - var gName string - allocAfter, gName = allocBefore.ReAllocate(serverName) - fmt.Println("AllocAfter: ", allocAfter) - fmt.Println("gName: ", gName) -} diff 
--git a/demos/transition/main.go b/demos/transition/main.go deleted file mode 100644 index e414704a9..000000000 --- a/demos/transition/main.go +++ /dev/null @@ -1,151 +0,0 @@ -package main - -import ( - "fmt" - "math" - "math/rand/v2" - "os" - - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/inferno/pkg/core" - "github.com/llm-inferno/inferno/pkg/manager" - "github.com/llm-inferno/inferno/pkg/solver" - "github.com/llm-inferno/inferno/pkg/utils" -) - -func main() { - size := "large" - if len(os.Args) > 1 { - size = os.Args[1] - } - prefix := "../../sample-data/" + size + "/" - fn_acc := prefix + "accelerator-data.json" - fn_cap := prefix + "capacity-data.json" - fn_mod := prefix + "model-data.json" - fn_svc := prefix + "serviceclass-data.json" - fn_srv := prefix + "server-data.json" - fn_opt := prefix + "optimizer-data.json" - - system := core.NewSystem() - - bytes_acc, err_acc := os.ReadFile(fn_acc) - if err_acc != nil { - fmt.Println(err_acc) - } - if d, err := utils.FromDataToSpec(bytes_acc, config.AcceleratorData{}); err == nil { - system.SetAcceleratorsFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_cap, err_cap := os.ReadFile(fn_cap) - if err_cap != nil { - fmt.Println(err_cap) - } - if d, err := utils.FromDataToSpec(bytes_cap, config.CapacityData{}); err == nil { - system.SetCapacityFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_mod, err_mod := os.ReadFile(fn_mod) - if err_mod != nil { - fmt.Println(err_mod) - } - if d, err := utils.FromDataToSpec(bytes_mod, config.ModelData{}); err == nil { - system.SetModelsFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_svc, err_svc := os.ReadFile(fn_svc) - if err_svc != nil { - fmt.Println(err_svc) - } - if d, err := utils.FromDataToSpec(bytes_svc, config.ServiceClassData{}); err == nil { - system.SetServiceClassesFromSpec(d) - } else { - fmt.Println(err) - return - } - - bytes_srv, err_srv := os.ReadFile(fn_srv) - if err_srv != nil { - fmt.Println(err_srv) - } - if d, err := utils.FromDataToSpec(bytes_srv, config.ServerData{}); err == nil { - system.SetServersFromSpec(d) - } else { - fmt.Println(err) - return - } - - var optimizer *solver.Optimizer - bytes_opt, err_opt := os.ReadFile(fn_opt) - if err_opt != nil { - fmt.Println(err_acc) - } - if d, err := utils.FromDataToSpec(bytes_opt, config.OptimizerData{}); err == nil { - optimizer = solver.NewOptimizerFromSpec(&d.Spec) - } else { - fmt.Println(err) - return - } - - manager := manager.NewManager(system, optimizer) - - system.Calculate() - if err := manager.Optimize(); err != nil { - fmt.Println(err) - return - } - - fmt.Printf("%v", system) - fmt.Printf("%v", optimizer) - - // generate random values in [alpha, 2 - alpha), where 0 < alpha < 1 - alpha := float32(0.1) - - for _, server := range system.Servers() { - load := server.Load() - if load == nil { - continue - } - - factorA := 2 * (rand.Float32() - 0.5) * (1 - alpha) - newArv := load.ArrivalRate * (1 + factorA) - if newArv <= 0 { - newArv = 1 - } - - factorB := 2 * (rand.Float32() - 0.5) * (1 - alpha) - newLength := int(math.Ceil(float64(float32(load.AvgLength) * (1 + factorB)))) - if newLength <= 0 { - newLength = 1 - } - newLoad := config.ServerLoadSpec{ - ArrivalRate: newArv, - AvgLength: newLength, - ArrivalCOV: load.ArrivalCOV, - ServiceCOV: load.ServiceCOV, - } - server.SetLoad(&newLoad) - if curAllocation := server.CurAllocation(); curAllocation != nil { - server.SetCurAllocation(server.Allocation().Clone()) - } - - // fmt.Printf("s=%s, rate=%v, tokens=%d 
\n", - // server.Name(), load.ArrivalRate, load.AvgLength) - } - - system.Calculate() - if err := manager.Optimize(); err != nil { - fmt.Println(err) - return - } - fmt.Printf("%v", system) - fmt.Printf("%v", optimizer) -} diff --git a/deploy/configmap-accelerator-unitcost.yaml b/deploy/configmap-accelerator-unitcost.yaml new file mode 100644 index 000000000..f15e60193 --- /dev/null +++ b/deploy/configmap-accelerator-unitcost.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: accelerator-unit-costs + namespace: default +data: + A100: "1.23" + L4: "0.42" + H100: "2.50" \ No newline at end of file diff --git a/deploy/configmap-serviceclass.yaml b/deploy/configmap-serviceclass.yaml new file mode 100644 index 000000000..0aab60cba --- /dev/null +++ b/deploy/configmap-serviceclass.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: service-classes-config + namespace: default +data: + premium.yaml: | + name: Premium + priority: 1 + data: + - model: default + slo-itl: 40 + slo-ttw: 500 + - model: llama0-70b + slo-itl: 80 + slo-ttw: 500 + freemium.yaml: | + name: Freemium + priority: 10 + data: + - model: granite-13b + slo-itl: 200 + slo-ttw: 2000 + - model: llama0-7b + slo-itl: 150 + slo-ttw: 1500 diff --git a/deploy/local-cluster.sh b/deploy/local-cluster.sh new file mode 100755 index 000000000..990ca5eee --- /dev/null +++ b/deploy/local-cluster.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +set -euo pipefail + +cluster_name="a100-cluster" +control_plane_node="${cluster_name}-control-plane" +worker1_node="${cluster_name}-worker" +worker2_node="${cluster_name}-worker2" + +echo "[1/3] Creating Kind cluster: ${cluster_name}..." + +cat < kind-config.yaml +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane +- role: worker +- role: worker +EOF + +kind create cluster --name "${cluster_name}" --config kind-config.yaml + +echo "[2/3] Waiting for node ${control_plane_node} to be ready..." +while [[ $(kubectl get nodes "${control_plane_node}" --no-headers 2>/dev/null | awk '{print $2}') != "Ready" ]]; do + sleep 1 +done + +echo "[3/3] Patching node ${control_plane_node} with GPU annotation and capacity..." +cat < /dev/null 2>&1 & +proxy_pid=$! +sleep 2 # Give proxy a moment to start + +echo "Starting background proxy connection (pid=${proxy_pid})..." + curl 127.0.0.1:8001 > /dev/null 2>&1 + if [[ ! $? -eq 0 ]]; then + echo "Calling 'kubectl proxy' did not create a successful connection to the kubelet needed to patch the nodes. Exiting." + exit 1 + else + echo "Connected to the kubelet for patching the nodes" + fi + +# Patch nodes + for node_name in $(kubectl get nodes --no-headers -o custom-columns=":metadata.name") + do + echo "- Patching node (add): ${node_name}" + if [[ "${node_name}" == "${worker1_node}" ]]; then + resource_name="amd.com~1gpu" + resource_count="6" + elif [[ "${node_name}" == "${worker2_node}" ]]; then + resource_name="intel.com~1gpu" + resource_count="4" + else + resource_name="nvidia.com~1gpu" + resource_count="8" + fi + + curl --header "Content-Type: application/json-patch+json" \ + --request PATCH \ + --data '[{"op":"add","path":"/status/capacity/'${resource_name}'","value":"'${resource_count}'"}]' \ + http://localhost:8001/api/v1/nodes/${node_name}/status + done + +echo "[5/5] Cleaning up..." +kill -9 ${proxy_pid} + +echo "🎉 Done: Nodes have GPU annotations, capacities, and allocatables set." 
diff --git a/deploy/single-node-gpu-cluster.sh b/deploy/single-node-gpu-cluster.sh
new file mode 100644
index 000000000..de4143bd4
--- /dev/null
+++ b/deploy/single-node-gpu-cluster.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+set -e
+set -o pipefail
+
+GPU_OPERATOR_NS=gpu-operator
+
+echo "> Creating Kind cluster"
+kind create cluster --config - <<EOF
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  # Expose all host GPUs to the node via the NVIDIA container-toolkit
+  # volume-mount device-injection convention.
+  extraMounts:
+  - hostPath: /dev/null
+    containerPath: /var/run/nvidia-container-devices/all
+EOF
+
+echo "> Deploying cert manager"
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.15.3/cert-manager.yaml
+
+echo "> Creating symlink in the control-plane container"
+docker exec -ti kind-control-plane ln -s /sbin/ldconfig /sbin/ldconfig.real
+
+echo "> Unmounting the nvidia devices in the control-plane container"
+docker exec -ti kind-control-plane umount -R /proc/driver/nvidia
+
+# According to https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html
+echo "> Adding/updating the NVIDIA Helm repository"
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
+
+echo "> Installing the GPU Operator Helm chart"
+helm upgrade --install --wait gpu-operator -n ${GPU_OPERATOR_NS} --create-namespace nvidia/gpu-operator \
+  --set mig.strategy=mixed \
+  --set cdi.enabled=true \
+  --set migManager.enabled=false \
+  --set migManager.config.default=""
+
+echo "> Waiting for container toolkit daemonset to be created"
+timeout 60s bash -c "until kubectl get daemonset nvidia-container-toolkit-daemonset -o name -n ${GPU_OPERATOR_NS}; do sleep 10; done"
+
+echo "> Waiting for container toolkit daemonset to become ready"
+kubectl rollout status daemonset nvidia-container-toolkit-daemonset -n ${GPU_OPERATOR_NS}
+
+echo "> Waiting for device plugin daemonset to be created"
+timeout 60s bash -c "until kubectl get daemonset nvidia-device-plugin-daemonset -o name -n ${GPU_OPERATOR_NS}; do sleep 10; done"
+
+echo "> Waiting for device plugin daemonset to become ready"
+kubectl rollout status daemonset nvidia-device-plugin-daemonset -n ${GPU_OPERATOR_NS}
\ No newline at end of file
diff --git a/deploy/ticker-configmap.yaml b/deploy/ticker-configmap.yaml
new file mode 100644
index 000000000..8cae2e5f3
--- /dev/null
+++ b/deploy/ticker-configmap.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: inferno-variantautoscaling-config
+  namespace: default
+data:
+  GLOBAL_OPT_INTERVAL: "1m"
+  GLOBAL_OPT_TRIGGER: "false"
\ No newline at end of file
diff --git a/docs/arch/high-level.png b/docs/arch/high-level.png deleted file mode 100644 index a48690fdb..000000000 Binary files a/docs/arch/high-level.png and /dev/null differ diff --git a/docs/arch/runtime.png b/docs/arch/runtime.png deleted file mode 100644 index 9765234e8..000000000 Binary files a/docs/arch/runtime.png and /dev/null differ diff --git a/docs/diagrams/inferno-WVA-design.png b/docs/diagrams/inferno-WVA-design.png new file mode 100644 index 000000000..bdaee8005 Binary files /dev/null and b/docs/diagrams/inferno-WVA-design.png differ diff --git a/docs/figs/Slide10.png b/docs/figs/Slide10.png deleted file mode 100644 index 025f4cd71..000000000 Binary files a/docs/figs/Slide10.png and /dev/null differ diff --git a/docs/figs/Slide11.png b/docs/figs/Slide11.png deleted file mode 100644 index 9f2733f01..000000000 Binary files a/docs/figs/Slide11.png and /dev/null differ diff --git a/docs/figs/Slide12.png b/docs/figs/Slide12.png deleted file mode 100644 index 7cb35c96f..000000000 Binary files a/docs/figs/Slide12.png and /dev/null differ diff --git a/docs/figs/Slide13.png b/docs/figs/Slide13.png deleted file mode 100644 index
93ebbeee0..000000000 Binary files a/docs/figs/Slide13.png and /dev/null differ diff --git a/docs/figs/Slide14.png b/docs/figs/Slide14.png deleted file mode 100644 index dd51c72c2..000000000 Binary files a/docs/figs/Slide14.png and /dev/null differ diff --git a/docs/figs/Slide16.png b/docs/figs/Slide16.png deleted file mode 100644 index 65870f7a5..000000000 Binary files a/docs/figs/Slide16.png and /dev/null differ diff --git a/docs/figs/Slide17.png b/docs/figs/Slide17.png deleted file mode 100644 index 03ca01079..000000000 Binary files a/docs/figs/Slide17.png and /dev/null differ diff --git a/docs/figs/Slide19.png b/docs/figs/Slide19.png deleted file mode 100644 index 6c6876698..000000000 Binary files a/docs/figs/Slide19.png and /dev/null differ diff --git a/docs/figs/Slide2.png b/docs/figs/Slide2.png deleted file mode 100644 index 9765234e8..000000000 Binary files a/docs/figs/Slide2.png and /dev/null differ diff --git a/docs/figs/Slide20.png b/docs/figs/Slide20.png deleted file mode 100644 index 347353299..000000000 Binary files a/docs/figs/Slide20.png and /dev/null differ diff --git a/docs/figs/Slide21.png b/docs/figs/Slide21.png deleted file mode 100644 index c20f2af3f..000000000 Binary files a/docs/figs/Slide21.png and /dev/null differ diff --git a/docs/figs/Slide23.png b/docs/figs/Slide23.png deleted file mode 100644 index 7d811a957..000000000 Binary files a/docs/figs/Slide23.png and /dev/null differ diff --git a/docs/figs/Slide24.png b/docs/figs/Slide24.png deleted file mode 100644 index 3e0f3bd5f..000000000 Binary files a/docs/figs/Slide24.png and /dev/null differ diff --git a/docs/figs/Slide26.png b/docs/figs/Slide26.png deleted file mode 100644 index 6484162b3..000000000 Binary files a/docs/figs/Slide26.png and /dev/null differ diff --git a/docs/figs/Slide27.png b/docs/figs/Slide27.png deleted file mode 100644 index c1a03740a..000000000 Binary files a/docs/figs/Slide27.png and /dev/null differ diff --git a/docs/figs/Slide28.png b/docs/figs/Slide28.png deleted file mode 100644 index 5ebc368ed..000000000 Binary files a/docs/figs/Slide28.png and /dev/null differ diff --git a/docs/figs/Slide3.png b/docs/figs/Slide3.png deleted file mode 100644 index 3108948a4..000000000 Binary files a/docs/figs/Slide3.png and /dev/null differ diff --git a/docs/figs/Slide30.png b/docs/figs/Slide30.png deleted file mode 100644 index e94830454..000000000 Binary files a/docs/figs/Slide30.png and /dev/null differ diff --git a/docs/figs/Slide32.png b/docs/figs/Slide32.png deleted file mode 100644 index ff7ac4f3d..000000000 Binary files a/docs/figs/Slide32.png and /dev/null differ diff --git a/docs/figs/Slide33.png b/docs/figs/Slide33.png deleted file mode 100644 index 687215361..000000000 Binary files a/docs/figs/Slide33.png and /dev/null differ diff --git a/docs/figs/Slide34.png b/docs/figs/Slide34.png deleted file mode 100644 index f73bd0cbb..000000000 Binary files a/docs/figs/Slide34.png and /dev/null differ diff --git a/docs/figs/Slide4.png b/docs/figs/Slide4.png deleted file mode 100644 index 679875245..000000000 Binary files a/docs/figs/Slide4.png and /dev/null differ diff --git a/docs/figs/Slide5.png b/docs/figs/Slide5.png deleted file mode 100644 index 344b9f3fa..000000000 Binary files a/docs/figs/Slide5.png and /dev/null differ diff --git a/docs/figs/Slide6.png b/docs/figs/Slide6.png deleted file mode 100644 index 05be4b5da..000000000 Binary files a/docs/figs/Slide6.png and /dev/null differ diff --git a/docs/figs/Slide7.png b/docs/figs/Slide7.png deleted file mode 100644 index 9bc1aa5c2..000000000 
Binary files a/docs/figs/Slide7.png and /dev/null differ diff --git a/docs/figs/Slide8.png b/docs/figs/Slide8.png deleted file mode 100644 index 9fff95fa9..000000000 Binary files a/docs/figs/Slide8.png and /dev/null differ diff --git a/docs/figs/Slide9.png b/docs/figs/Slide9.png deleted file mode 100644 index 65f5f1399..000000000 Binary files a/docs/figs/Slide9.png and /dev/null differ diff --git a/docs/figs/slide22.png b/docs/figs/slide22.png deleted file mode 100644 index a0d947bd5..000000000 Binary files a/docs/figs/slide22.png and /dev/null differ diff --git a/docs/slides/inferno-dynamic.pdf b/docs/slides/inferno-dynamic.pdf deleted file mode 100644 index 9d0a7939b..000000000 Binary files a/docs/slides/inferno-dynamic.pdf and /dev/null differ diff --git a/docs/slides/summary.pdf b/docs/slides/summary.pdf deleted file mode 100644 index 32a4eb848..000000000 Binary files a/docs/slides/summary.pdf and /dev/null differ diff --git a/go.mod b/go.mod index fe5898810..11e893c66 100644 --- a/go.mod +++ b/go.mod @@ -1,75 +1,100 @@ -module github.com/llm-inferno/inferno +module github.com/llm-d-incubation/inferno-autoscaler go 1.23.0 +godebug default=go1.23 + require ( - github.com/gin-gonic/gin v1.10.0 - github.com/llm-inferno/lpsolve v0.0.0-20250602153134-23b56773e87c - github.com/llm-inferno/queue-analysis v0.0.0-20250602150849-402e8a06efa7 - github.com/prometheus/client_golang v1.21.1 - github.com/prometheus/common v0.63.0 - k8s.io/api v0.32.3 - k8s.io/apimachinery v0.32.3 - k8s.io/client-go v0.32.3 + github.com/onsi/ginkgo/v2 v2.22.0 + github.com/onsi/gomega v1.36.1 + k8s.io/apimachinery v0.32.1 + k8s.io/client-go v0.32.1 + sigs.k8s.io/controller-runtime v0.20.4 ) require ( - github.com/bytedance/sonic v1.13.1 // indirect - github.com/bytedance/sonic/loader v0.2.4 // indirect - github.com/cloudwego/base64x v0.1.5 // indirect + cel.dev/expr v0.18.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/draffensperger/golp v0.0.0-20241201023928-94a60bf898d2 // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/gabriel-vasile/mimetype v1.4.8 // indirect - github.com/gin-contrib/sse v1.0.0 // indirect github.com/go-logr/logr v1.4.2 // indirect - github.com/go-openapi/jsonpointer v0.21.1 // indirect - github.com/go-openapi/jsonreference v0.21.0 // indirect - github.com/go-openapi/swag v0.23.1 // indirect - github.com/go-playground/locales v0.14.1 // indirect - github.com/go-playground/universal-translator v0.18.1 // indirect - github.com/go-playground/validator/v10 v10.25.0 // indirect - github.com/goccy/go-json v0.10.5 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect 
github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/cel-go v0.22.0 // indirect + github.com/google/gnostic-models v0.6.8 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/gofuzz v1.2.0 // indirect + github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/cpuid/v2 v2.2.10 // indirect - github.com/leodido/go-urn v1.4.0 // indirect - github.com/mailru/easyjson v0.9.0 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect - github.com/spf13/pflag v1.0.6 // indirect - github.com/twitchyliquid64/golang-asm v0.15.1 // indirect - github.com/ugorji/go/codec v1.2.12 // indirect + github.com/prometheus/common v0.62.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + github.com/spf13/cobra v1.8.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/arch v0.15.0 // indirect - golang.org/x/crypto v0.36.0 // indirect - golang.org/x/net v0.37.0 // indirect - golang.org/x/oauth2 v0.28.0 // indirect - golang.org/x/sys v0.31.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect - golang.org/x/time v0.11.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect + go.opentelemetry.io/otel v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect + go.opentelemetry.io/otel/metric v1.28.0 // indirect + go.opentelemetry.io/otel/sdk v1.28.0 // indirect + go.opentelemetry.io/otel/trace v1.28.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect + golang.org/x/net v0.33.0 // indirect + golang.org/x/oauth2 v0.24.0 // indirect + golang.org/x/sync v0.10.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/term v0.27.0 // indirect + golang.org/x/text v0.21.0 // indirect + golang.org/x/time v0.7.0 // indirect + golang.org/x/tools v0.26.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect + google.golang.org/grpc v1.65.0 // indirect google.golang.org/protobuf v1.36.5 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/api v0.32.1 
// indirect + k8s.io/apiextensions-apiserver v0.32.1 // indirect + k8s.io/apiserver v0.32.1 // indirect + k8s.io/component-base v0.32.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250304201544-e5f78fe3ede9 // indirect - k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect - sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect + k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect + k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect + sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect + sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/go.sum b/go.sum index 047a487ee..ba50bd104 100644 --- a/go.sum +++ b/go.sum @@ -1,58 +1,65 @@ +cel.dev/expr v0.18.0 h1:CJ6drgk+Hf96lkLikr4rFf19WrU0BOWEihyZnI2TAzo= +cel.dev/expr v0.18.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= +github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= +github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bytedance/sonic v1.13.1 h1:Jyd5CIvdFnkOWuKXr+wm4Nyk2h0yAFsr8ucJgEasO3g= -github.com/bytedance/sonic v1.13.1/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1+KgkJhz4= -github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= -github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY= -github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= -github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= -github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
-github.com/draffensperger/golp v0.0.0-20241201023928-94a60bf898d2 h1:y10u52tZ6hYiCjExVmaE8fnkhxrCpiTZvNZc7ZIfQgo= -github.com/draffensperger/golp v0.0.0-20241201023928-94a60bf898d2/go.mod h1:/TbDI9zua4CTUs81AOyDxnKAuvXX/SmOjonijHadP+k= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= -github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= -github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= -github.com/gin-contrib/sse v1.0.0 h1:y3bT1mUWUxDpW4JLQg/HnTqV4rozuW4tC9eFKTxYI9E= -github.com/gin-contrib/sse v1.0.0/go.mod h1:zNuFdwarAygJBht0NTKiSi3jRf6RbqeILZ9Sp6Slhe0= -github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= -github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= -github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= -github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= -github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= -github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= -github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= -github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= -github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= -github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= -github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= -github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= -github.com/go-playground/validator/v10 v10.25.0 h1:5Dh7cjvzR7BRZadnsVOzPhWsrwUr0nmsZJxEAnFLNO8= 
-github.com/go-playground/validator/v10 v10.25.0/go.mod h1:GGzBIJMuE98Ic/kJsBXbz1x/7cByt++cQ+YOuDM5wus= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= -github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/cel-go v0.22.0 h1:b3FJZxpiv1vTMo2/5RDUqAHPxkT8mmMfJIrq1llbf7g= +github.com/google/cel-go v0.22.0/go.mod h1:BuznPXXfQDpXKWQ9sPW3TzlAJN5zzFe+i9tIs0yC4s8= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -62,32 +69,25 @@ github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgY github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 
h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= -github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= -github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= -github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= -github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= -github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= -github.com/llm-inferno/lpsolve v0.0.0-20250602153134-23b56773e87c h1:MiVso5MxSHm6LRzoHl9adJrguC39srJfw9EokTkM3pI= -github.com/llm-inferno/lpsolve v0.0.0-20250602153134-23b56773e87c/go.mod h1:FzxR0cEKOAYgb2gO55vEv8SyCnDkAqGm+xZNQl2gb5E= -github.com/llm-inferno/queue-analysis v0.0.0-20250602150849-402e8a06efa7 h1:2qKjlbEYHeYEzUqN0NhPbRJlIKpst3JEQ1FHHVykOOU= -github.com/llm-inferno/queue-analysis v0.0.0-20250602150849-402e8a06efa7/go.mod h1:v/9Ae2WaDwn86zJDMCQxBADtT4nxmkyuwOzmkSypzfg= -github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= -github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= -github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= -github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -95,85 +95,117 @@ 
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= -github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= -github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= -github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= -github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= +github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= +github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= +github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= -github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= -github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.12.0 
h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= -github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= +github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= -github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= -github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= -github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -golang.org/x/arch v0.15.0 h1:QtOrQd0bTUnhNVNndMpLHNWrDmYzZ2KDqSrEymqInZw= -golang.org/x/arch v0.15.0/go.mod h1:JmwW7aLIoRUKgaTzhkiEFxvcEiQGyOg9BMonBJUS7EE= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= +go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= +go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= +go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= +go.opentelemetry.io/otel/metric v1.28.0/go.mod h1:Fb1eVBFZmLVTMb6PPohq3TO9IIhUisDsbJoL/+uQW4s= +go.opentelemetry.io/otel/sdk v1.28.0 h1:b9d7hIry8yZsgtbmM0DKyPWMMUMlK9NEKuIG4aBqWyE= +go.opentelemetry.io/otel/sdk v1.28.0/go.mod h1:oYj7ClPUA7Iw3m+r7GeEjz0qckQRJK2B8zjcZEfu7Pg= +go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g= +go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= -golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c= -golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= -golang.org/x/oauth2 v0.28.0 h1:CrgCKl8PPAVtLnU3c+EDw6x11699EWlsDeWNWKdIOkc= -golang.org/x/oauth2 v0.28.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= 
+golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= +golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= -golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= +golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -184,6 +216,16 @@ golang.org/x/xerrors 
v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 h1:YcyjlL1PRr2Q17/I0dPk2JmYS5CDXfcdb2Z3YRioEbw= +google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7/go.mod h1:OCdP9MfskevB/rbYvHTsXTtKC+3bHWajPdoKgjcYkfo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 h1:2035KHhUv+EpyB+hWgJnaWKJOdX1E95w2S8Rr4uWKTs= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -193,30 +235,34 @@ gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSP gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.3 h1:Hw7KqxRusq+6QSplE3NYG4MBxZw1BZnq4aP4cJVINls= -k8s.io/api v0.32.3/go.mod h1:2wEDTXADtm/HA7CCMD8D8bK4yuBUptzaRhYcYEEYA3k= -k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U= -k8s.io/apimachinery v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/client-go v0.32.3 h1:RKPVltzopkSgHS7aS98QdscAgtgah/+zmpAogooIqVU= -k8s.io/client-go v0.32.3/go.mod h1:3v0+3k4IcT9bXTc4V2rt+d2ZPPG700Xy6Oi0Gdl2PaY= +k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= +k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= +k8s.io/apiextensions-apiserver v0.32.1 h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= +k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= +k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= +k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/apiserver v0.32.1 h1:oo0OozRos66WFq87Zc5tclUX2r0mymoVHRq8JmR7Aak= +k8s.io/apiserver v0.32.1/go.mod 
h1:UcB9tWjBY7aryeI5zAgzVJB/6k7E97bkr1RgqDz0jPw= +k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= +k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= +k8s.io/component-base v0.32.1 h1:/5IfJ0dHIKBWysGV0yKTFfacZ5yNV1sulPh3ilJjRZk= +k8s.io/component-base v0.32.1/go.mod h1:j1iMMHi/sqAHeG5z+O9BFNCF698a1u0186zkjMZQ28w= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250304201544-e5f78fe3ede9 h1:t0huyHnz6HsokckRxAF1bY0cqPFwzINKCL7yltEjZQc= -k8s.io/kube-openapi v0.0.0-20250304201544-e5f78fe3ede9/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= -k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= -sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= +k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= +k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= +sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU= +sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= +sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= +sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt new file mode 100644 index 000000000..221dcbe0b --- /dev/null +++ b/hack/boilerplate.go.txt @@ -0,0 +1,15 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
\ No newline at end of file
diff --git a/internal/actuator/README.md b/internal/actuator/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/actuator/dummy_actuator.go b/internal/actuator/dummy_actuator.go
new file mode 100644
index 000000000..0bbecf674
--- /dev/null
+++ b/internal/actuator/dummy_actuator.go
@@ -0,0 +1,53 @@
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+	appsv1 "k8s.io/api/apps/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+// DummyActuator applies the desired replica target to the Deployment backing a variant.
+type DummyActuator struct {
+	Client client.Client
+}
+
+func NewDummyActuator(k8sClient client.Client) *DummyActuator {
+	return &DummyActuator{Client: k8sClient}
+}
+
+// ApplyReplicaTargets patches the Deployment named after the VariantAutoscaling
+// to the replica count in .status.desiredOptimizedAlloc.
+func (a *DummyActuator) ApplyReplicaTargets(ctx context.Context, variantAutoscaling *llmdOptv1alpha1.VariantAutoscaling) error {
+	logger := logf.FromContext(ctx)
+	desired := variantAutoscaling.Status.DesiredOptimizedAlloc
+
+	// logr loggers take a constant message plus key/value pairs, not a printf format string.
+	logger.Info("applying replica target",
+		"model", variantAutoscaling.Spec.ModelID,
+		"accelerator", desired.Accelerator,
+		"targetReplicas", desired.NumReplicas,
+	)
+
+	var deploy appsv1.Deployment
+	err := a.Client.Get(ctx, types.NamespacedName{
+		Name:      variantAutoscaling.Name,
+		Namespace: variantAutoscaling.Namespace,
+	}, &deploy)
+	if err != nil {
+		return fmt.Errorf("failed to get Deployment %s/%s: %w", variantAutoscaling.Namespace, variantAutoscaling.Name, err)
+	}
+
+	// Patch only the replicas field, using the unmodified copy as the patch base.
+	original := deploy.DeepCopy()
+	replicas := int32(desired.NumReplicas)
+	deploy.Spec.Replicas = &replicas
+
+	patch := client.MergeFrom(original)
+	if err := a.Client.Patch(ctx, &deploy, patch); err != nil {
+		return fmt.Errorf("failed to patch Deployment %s: %w", deploy.Name, err)
+	}
+
+	logger.Info("patched Deployment", "deployment", deploy.Name, "replicas", replicas)
+	return nil
+}
diff --git a/internal/collector/Readme.md b/internal/collector/Readme.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/collector/collector.go b/internal/collector/collector.go
new file mode 100644
index 000000000..44eb4b5d8
--- /dev/null
+++ b/internal/collector/collector.go
@@ -0,0 +1,143 @@
+package controller
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"strconv"
+	"time"
+
+	"github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	"github.com/prometheus/common/model"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+type AcceleratorModelInfo struct {
+	Count  int
+	Memory string
+}
+
+// vendors lists the accelerator vendor label prefixes used when discovering GPU inventory.
+var vendors = []string{
+	"nvidia.com",
+	"amd.com",
+	"intel.com",
+}
+
+const DEBUG = 4
+
+// CollectInventoryK8S lists all Nodes and builds a map[nodeName][model]→info.
+// For each vendor prefix it checks the labels <vendor>/gpu.product and
+// <vendor>/gpu.memory, and the capacity resource <vendor>/gpu.
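+//
+// Illustrative example (NFD/GPU-feature-discovery style label schema; values
+// assumed): a node labeled nvidia.com/gpu.product=NVIDIA-A100-SXM4-80GB and
+// nvidia.com/gpu.memory=81920, with capacity nvidia.com/gpu: 8, yields
+//
+//	inv["node-1"]["NVIDIA-A100-SXM4-80GB"] = AcceleratorModelInfo{Count: 8, Memory: "81920"}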
+func CollectInventoryK8S(ctx context.Context, r client.Client) (map[string]map[string]AcceleratorModelInfo, error) {
+	logger := logf.FromContext(ctx)
+
+	logger.Info("collecting inventory")
+
+	var nodeList corev1.NodeList
+	if err := r.List(ctx, &nodeList); err != nil {
+		return nil, fmt.Errorf("failed to list nodes: %w", err)
+	}
+
+	inv := make(map[string]map[string]AcceleratorModelInfo)
+	for _, node := range nodeList.Items {
+		nodeName := node.Name
+		for _, vendor := range vendors {
+			prodKey := vendor + "/gpu.product"
+			memKey := vendor + "/gpu.memory"
+			if gpuModel, ok := node.Labels[prodKey]; ok {
+				// found a GPU of this vendor
+				mem := node.Labels[memKey]
+				count := 0
+				if qty, ok := node.Status.Capacity[corev1.ResourceName(vendor+"/gpu")]; ok {
+					count = int(qty.Value())
+				}
+				if inv[nodeName] == nil {
+					inv[nodeName] = make(map[string]AcceleratorModelInfo)
+				}
+				inv[nodeName][gpuModel] = AcceleratorModelInfo{
+					Count:  count,
+					Memory: mem,
+				}
+				logger.V(DEBUG).Info("found inventory", "nodeName", nodeName, "model", gpuModel, "count", count, "mem", mem)
+			}
+		}
+	}
+	return inv, nil
+}
+
+type MetricKV struct {
+	Name   string
+	Labels map[string]string
+	Value  float64
+}
+
+// AddMetricsToOptStatus fills .status.currentAlloc of the VariantAutoscaling from
+// the Deployment and from vLLM metrics scraped by Prometheus.
+func AddMetricsToOptStatus(ctx context.Context, opt *v1alpha1.VariantAutoscaling, deployment appsv1.Deployment, acceleratorCostVal float64, promAPI promv1.API) error {
+	logger := logf.FromContext(ctx)
+	deployNamespace := deployment.Namespace
+	modelName := opt.Labels["inference.optimization/modelName"]
+
+	// Query 1: Arrival rate (requests per minute)
+	arrivalQuery := fmt.Sprintf(`sum(rate(vllm:requests_count_total{model_name="%s",namespace="%s"}[1m])) * 60`, modelName, deployNamespace)
+	arrivalVal := 0.0
+	if val, warn, err := promAPI.Query(ctx, arrivalQuery, time.Now()); err == nil && val.Type() == model.ValVector {
+		vec := val.(model.Vector)
+		if len(vec) > 0 {
+			arrivalVal = float64(vec[0].Value)
+		}
+		if warn != nil {
+			logger.Info("Prometheus warnings", "warnings", warn)
+		}
+	} else {
+		logger.Error(err, "failed to query Prometheus arrival rate")
+	}
+
+	// Query 2: Average token length (tokens per request over the last minute)
+	tokenQuery := fmt.Sprintf(`delta(vllm:tokens_count_total{model_name="%s",namespace="%s"}[1m])/delta(vllm:requests_count_total{model_name="%s",namespace="%s"}[1m])`, modelName, deployNamespace, modelName, deployNamespace)
+	avgLen := 0.0
+	if val, _, err := promAPI.Query(ctx, tokenQuery, time.Now()); err == nil && val.Type() == model.ValVector {
+		vec := val.(model.Vector)
+		if len(vec) > 0 {
+			avgLen = float64(vec[0].Value)
+		}
+	} else {
+		logger.Error(err, "failed to query Prometheus average token length")
+	}
+
+	// delta/delta can yield NaN or Inf when there was no traffic in the window.
+	if math.IsNaN(avgLen) || math.IsInf(avgLen, 0) {
+		avgLen = 0
+	}
+
+	// Query 3: Average request queue (wait) time
+	waitQuery := fmt.Sprintf(`sum(rate(vllm:request_queue_time_seconds_sum{model_name="%s",namespace="%s"}[1m]))/sum(rate(vllm:request_queue_time_seconds_count{model_name="%s",namespace="%s"}[1m]))`, modelName, deployNamespace, modelName, deployNamespace)
+	waitAverageTime := 0.0
+	if val, _, err := promAPI.Query(ctx, waitQuery, time.Now()); err == nil && val.Type() == model.ValVector {
+		vec := val.(model.Vector)
+		if len(vec) > 0 {
+			waitAverageTime = float64(vec[0].Value)
+		}
+	} else {
+		logger.Error(err, "failed to query Prometheus average request queue time")
+	}
+
+	opt.Status.CurrentAlloc.NumReplicas = int(*deployment.Spec.Replicas)
+	if acc, ok := opt.Labels["inference.optimization/acceleratorName"]; ok {
+		opt.Status.CurrentAlloc.Accelerator = acc
+	} else {
+		logger.Info("acceleratorName label not found on deployment", "deployment", deployment.Name)
+	}
+	opt.Status.CurrentAlloc.WaitAverage = strconv.FormatFloat(waitAverageTime, 'f', 2, 32)
+	opt.Status.CurrentAlloc.ITLAverage = "50"
+	// TODO: extract max batch size from the vLLM config present in the deployment
+	opt.Status.CurrentAlloc.MaxBatch = 256
+	opt.Status.CurrentAlloc.Load.ArrivalRate = strconv.FormatFloat(arrivalVal, 'f', 2, 32)
+	opt.Status.CurrentAlloc.Load.AvgLength = strconv.FormatFloat(avgLen, 'f', 2, 32)
+	// TODO: read the accelerator cost from a ConfigMap and adjust this value
+	discoveredCost := float64(*deployment.Spec.Replicas) * acceleratorCostVal
+	opt.Status.CurrentAlloc.VariantCost = strconv.FormatFloat(discoveredCost, 'f', 2, 32)
+	return nil
+}
diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go
new file mode 100644
index 000000000..1defde31f
--- /dev/null
+++ b/internal/controller/suite_test.go
@@ -0,0 +1,116 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+
+	llmdv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+	// +kubebuilder:scaffold:imports
+)
+
+// These tests use Ginkgo (BDD-style Go testing framework). Refer to
+// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.
+
+var (
+	ctx       context.Context
+	cancel    context.CancelFunc
+	testEnv   *envtest.Environment
+	cfg       *rest.Config
+	k8sClient client.Client
+)
+
+func TestControllers(t *testing.T) {
+	RegisterFailHandler(Fail)
+
+	RunSpecs(t, "Controller Suite")
+}
+
+var _ = BeforeSuite(func() {
+	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
+
+	ctx, cancel = context.WithCancel(context.TODO())
+
+	var err error
+	err = llmdv1alpha1.AddToScheme(scheme.Scheme)
+	Expect(err).NotTo(HaveOccurred())
+
+	// +kubebuilder:scaffold:scheme
+
+	By("bootstrapping test environment")
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "config", "crd", "bases")},
+		ErrorIfCRDPathMissing: true,
+	}
+
+	// Retrieve the first found binary directory to allow running tests from IDEs
+	if getFirstFoundEnvTestBinaryDir() != "" {
+		testEnv.BinaryAssetsDirectory = getFirstFoundEnvTestBinaryDir()
+	}
+
+	// cfg is defined in this file globally.
+ cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) + +// getFirstFoundEnvTestBinaryDir locates the first binary in the specified path. +// ENVTEST-based tests depend on specific binaries, usually located in paths set by +// controller-runtime. When running tests directly (e.g., via an IDE) without using +// Makefile targets, the 'BinaryAssetsDirectory' must be explicitly configured. +// +// This function streamlines the process by finding the required binaries, similar to +// setting the 'KUBEBUILDER_ASSETS' environment variable. To ensure the binaries are +// properly set up, run 'make setup-envtest' beforehand. +func getFirstFoundEnvTestBinaryDir() string { + basePath := filepath.Join("..", "..", "bin", "k8s") + entries, err := os.ReadDir(basePath) + if err != nil { + logf.Log.Error(err, "Failed to read directory", "path", basePath) + return "" + } + for _, entry := range entries { + if entry.IsDir() { + return filepath.Join(basePath, entry.Name()) + } + } + return "" +} diff --git a/internal/controller/variantautoscaling_controller.go b/internal/controller/variantautoscaling_controller.go new file mode 100644 index 000000000..dab92f2a2 --- /dev/null +++ b/internal/controller/variantautoscaling_controller.go @@ -0,0 +1,364 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"sync"
+	"time"
+
+	"gopkg.in/yaml.v3"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+
+	llmdVariantAutoscalingV1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+	actuator "github.com/llm-d-incubation/inferno-autoscaler/internal/actuator"
+	collector "github.com/llm-d-incubation/inferno-autoscaler/internal/collector"
+	interfaces "github.com/llm-d-incubation/inferno-autoscaler/internal/interfaces"
+	analyzer "github.com/llm-d-incubation/inferno-autoscaler/internal/modelanalyzer"
+	variantAutoscalingOptimizer "github.com/llm-d-incubation/inferno-autoscaler/internal/optimizer"
+	"github.com/prometheus/client_golang/api"
+	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	appsv1 "k8s.io/api/apps/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// VariantAutoscalingReconciler reconciles a VariantAutoscaling object
+type VariantAutoscalingReconciler struct {
+	client.Client
+	Scheme *runtime.Scheme
+
+	mu         sync.Mutex
+	ticker     *time.Ticker
+	stopTicker chan struct{}
+
+	PromAPI promv1.API
+}
+
+// +kubebuilder:rbac:groups=llmd.ai,resources=variantautoscalings,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=llmd.ai,resources=variantautoscalings/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=llmd.ai,resources=variantautoscalings/finalizers,verbs=update
+// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get;list;update;patch;watch
+// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;update;list;watch
+
+const (
+	configMapName      = "inferno-variantautoscaling-config"
+	configMapNamespace = "default"
+)
+
+type ServiceClassEntry struct {
+	Model  string `yaml:"model"`
+	SLOITL int    `yaml:"slo-itl"`
+	SLOTTW int    `yaml:"slo-ttw"`
+}
+
+type ServiceClass struct {
+	Name     string              `yaml:"name"`
+	Priority int                 `yaml:"priority"`
+	Data     []ServiceClassEntry `yaml:"data"`
+}
+
+func (r *VariantAutoscalingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := logf.FromContext(ctx)
+
+	serviceClassCm, err := r.readServiceClassConfig(ctx, "service-classes-config", "default")
+	if err != nil {
+		logger.Error(err, "unable to read service class configmap, skipping optimization")
+		return ctrl.Result{}, nil
+	}
+
+	acceleratorUnitCostCm, err := r.readServiceClassConfig(ctx, "accelerator-unit-costs", "default")
+	if err != nil {
+		logger.Error(err, "unable to read accelerator unit cost configmap, skipping optimization")
+		return ctrl.Result{}, nil
+	}
+
+	// Each variantAutoscaling CR corresponds to a variant, which spawns exactly one deployment.
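+	// The per-variant loop below (1) resolves the model's SLO from the service
+	// class ConfigMap, (2) snapshots the current load and allocation from
+	// Prometheus into .status.currentAlloc, (3) runs the (dummy) analyzer and
+	// optimizer to compute .status.desiredOptimizedAlloc, and (4) actuates the
+	// resulting replica target on the owning Deployment.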
+	var variantAutoscalingList llmdVariantAutoscalingV1alpha1.VariantAutoscalingList
+	if err := r.List(ctx, &variantAutoscalingList); err != nil {
+		logger.Error(err, "unable to list variantAutoscaling resources")
+		return ctrl.Result{}, err
+	}
+
+	newInventory, err := collector.CollectInventoryK8S(ctx, r.Client)
+	if err == nil {
+		logger.Info("current inventory in the cluster", "capacity", newInventory)
+	} else {
+		logger.Error(err, "failed to get cluster inventory")
+	}
+
+	for _, opt := range variantAutoscalingList.Items {
+		modelName := opt.Labels["inference.optimization/modelName"]
+		if modelName == "" {
+			logger.Info("variantAutoscaling missing modelName label, skipping optimization", "name", opt.Name)
+			continue
+		}
+
+		entry, className, err := findModelSLO(serviceClassCm, modelName)
+		if err != nil {
+			logger.Error(err, "failed to locate SLO for model", "model", modelName)
+			continue
+		}
+
+		logger.Info("Found SLO", "model", entry.Model, "class", className, "slo-itl", entry.SLOITL, "slo-ttw", entry.SLOTTW)
+
+		acceleratorCostVal, ok := acceleratorUnitCostCm["A100"]
+		if !ok {
+			logger.Info("variantAutoscaling missing accelerator cost in configmap, skipping optimization", "name", opt.Name)
+			continue
+		}
+		acceleratorCostValFloat, err := strconv.ParseFloat(acceleratorCostVal, 32)
+		if err != nil {
+			logger.Info("variantAutoscaling unable to parse accelerator cost in configmap, skipping optimization", "name", opt.Name)
+			continue
+		}
+		// Check if a Deployment exists for this variantAutoscaling
+		var deploy appsv1.Deployment
+		err = r.Get(ctx, types.NamespacedName{
+			Name:      opt.Name,
+			Namespace: opt.Namespace,
+		}, &deploy)
+		if err != nil {
+			if apierrors.IsNotFound(err) {
+				continue
+			}
+			logger.Error(err, "failed to get Deployment", "variantAutoscaling", opt.Name)
+			return ctrl.Result{}, err
+		}
+
+		// Re-fetch the CR so the patch below is computed against a fresh copy.
+		var updateOpt llmdVariantAutoscalingV1alpha1.VariantAutoscaling
+		if err := r.Get(ctx, client.ObjectKey{Name: deploy.Name, Namespace: deploy.Namespace}, &updateOpt); err != nil {
+			logger.Error(err, "unable to get variantAutoscaling")
+			continue
+		}
+
+		original := updateOpt.DeepCopy()
+
+		// Add OwnerReference if not already set
+		if !metav1.IsControlledBy(&updateOpt, &deploy) {
+			updateOpt.OwnerReferences = append(updateOpt.OwnerReferences, metav1.OwnerReference{
+				APIVersion:         deploy.APIVersion,
+				Kind:               deploy.Kind,
+				Name:               deploy.Name,
+				UID:                deploy.UID,
+				Controller:         ptr(true),
+				BlockOwnerDeletion: ptr(true),
+			})
+		}
+
+		err = collector.AddMetricsToOptStatus(ctx, &updateOpt, deploy, acceleratorCostValFloat, r.PromAPI)
+		if err != nil {
+			logger.Error(err, "unable to fetch metrics, skipping this variantAutoscaling loop")
+			continue
+		}
+		dummyQps := 50.0
+		metrics := interfaces.MetricsSnapshot{
+			ActualQPS: dummyQps,
+		}
+		dummyAnalyzer := analyzer.NewSimplePrefillDecodeAnalyzer()
+		dummyModelAnalyzerResponse, err := dummyAnalyzer.AnalyzeModel(ctx, updateOpt, metrics)
+		if err != nil {
+			logger.Error(err, "unable to perform model analysis, skipping this variantAutoscaling loop")
+			continue
+		}
+		logger.Info("response from model analyzer", "data", dummyModelAnalyzerResponse)
+		dummyvariantAutoscaling := variantAutoscalingOptimizer.NewDummyVariantAutoscalingsEngine()
+		optimizedAllocation, err := dummyvariantAutoscaling.Optimize(ctx, opt, *dummyModelAnalyzerResponse, metrics)
+		if err != nil {
+			logger.Error(err, "unable to perform model optimization, skipping this variantAutoscaling loop")
+			continue
+		}
+		updateOpt.Status.DesiredOptimizedAlloc = optimizedAllocation
+		// Patch metadata (owner references) and status separately; status is a subresource.
+		patch := client.MergeFrom(original.DeepCopy())
+		if err := r.Patch(ctx, &updateOpt, patch); err != nil {
+			logger.Error(err, "failed to patch variantAutoscaling")
+		}
+		if err := r.Status().Patch(ctx, &updateOpt, patch); err != nil {
+			logger.Error(err, "failed to patch status")
+		}
+		dummyActuator := actuator.NewDummyActuator(r.Client)
+		// Pass updateOpt, which carries the freshly computed DesiredOptimizedAlloc.
+		err = dummyActuator.ApplyReplicaTargets(ctx, &updateOpt)
+		if err != nil {
+			logger.Error(err, "unable to change replicas", "deployment", deploy.Name)
+		}
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *VariantAutoscalingReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	// Start the ConfigMap watch and ticker loop once this replica is elected leader.
+	if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
+		<-mgr.Elected() // Wait for leader election
+		r.watchAndRunLoop()
+		return nil
+	})); err != nil {
+		return err
+	}
+
+	promClient, err := api.NewClient(api.Config{
+		Address: "http://prometheus-operated.default.svc.cluster.local:9090",
+	})
+	if err != nil {
+		return fmt.Errorf("failed to create prometheus client: %w", err)
+	}
+
+	r.PromAPI = promv1.NewAPI(promClient)
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&llmdVariantAutoscalingV1alpha1.VariantAutoscaling{}).
+		Named("variantAutoscaling").
+		Complete(r)
+}
+
+// watchAndRunLoop polls the optimization ConfigMap every 10s. GLOBAL_OPT_TRIGGER="true"
+// forces an immediate optimization pass and is reset afterwards; GLOBAL_OPT_INTERVAL
+// (a time.ParseDuration string such as "60s") drives a periodic ticker.
+func (r *VariantAutoscalingReconciler) watchAndRunLoop() {
+	var lastInterval string
+
+	for {
+		cm := &corev1.ConfigMap{}
+		err := r.Get(context.Background(), types.NamespacedName{
+			Name:      configMapName,
+			Namespace: configMapNamespace,
+		}, cm)
+		if err != nil {
+			logf.Log.Error(err, "Unable to read optimization config")
+			time.Sleep(30 * time.Second)
+			continue
+		}
+
+		interval := cm.Data["GLOBAL_OPT_INTERVAL"]
+		trigger := cm.Data["GLOBAL_OPT_TRIGGER"]
+
+		// Handle manual trigger
+		if trigger == "true" {
+			logf.Log.Info("Manual optimization trigger received")
+			_, err := r.Reconcile(context.Background(), ctrl.Request{})
+			if err != nil {
+				logf.Log.Error(err, "Manual reconcile failed")
+			}
+
+			// Reset trigger in ConfigMap
+			cm.Data["GLOBAL_OPT_TRIGGER"] = "false"
+			if err := r.Update(context.Background(), cm); err != nil {
+				logf.Log.Error(err, "Failed to reset GLOBAL_OPT_TRIGGER")
+			}
+		}
+
+		r.mu.Lock()
+		if interval != lastInterval {
+			// Stop previous ticker if any
+			if r.stopTicker != nil {
+				close(r.stopTicker)
+			}
+
+			if interval != "" {
+				d, err := time.ParseDuration(interval)
+				if err != nil {
+					logf.Log.Error(err, "Invalid GLOBAL_OPT_INTERVAL")
+					r.mu.Unlock()
+					continue
+				}
+
+				r.stopTicker = make(chan struct{})
+				ticker := time.NewTicker(d)
+				r.ticker = ticker
+
+				go func(stopCh <-chan struct{}, tick <-chan time.Time) {
+					for {
+						select {
+						case <-tick:
+							_, err := r.Reconcile(context.Background(), ctrl.Request{})
+							if err != nil {
+								logf.Log.Error(err, "Periodic reconcile failed")
+							}
+						case <-stopCh:
+							return
+						}
+					}
+				}(r.stopTicker, ticker.C)
+
+				logf.Log.Info("Started periodic optimization ticker", "interval", interval)
+			} else {
+				r.ticker = nil
+				logf.Log.Info("GLOBAL_OPT_INTERVAL unset, disabling periodic optimization")
+			}
+			lastInterval = interval
+		}
+		r.mu.Unlock()
+
+		time.Sleep(10 * time.Second)
+	}
+}
+
+// readServiceClassConfig fetches a ConfigMap with exponential backoff and returns its data.
+func (r *VariantAutoscalingReconciler) readServiceClassConfig(ctx context.Context, cmName, cmNamespace string) (map[string]string, error) {
+	logger := log.FromContext(ctx)
+
+	var cm corev1.ConfigMap
+	backoff := wait.Backoff{
+		Duration: 100 * time.Millisecond,
+		Factor:   2.0,
+		Jitter:   0.1,
+		Steps:    5,
+	}
+
+	err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
+		err := 
r.Get(ctx, client.ObjectKey{Name: cmName, Namespace: cmNamespace}, &cm) + if err == nil { + return true, nil + } + + if apierrors.IsNotFound(err) { + logger.Error(err, "ConfigMap not found, will not retry", "name", cmName, "namespace", cmNamespace) + return false, err + } + + logger.Error(err, "Transient error fetching ConfigMap, retrying...") + return false, nil + }) + + if err != nil { + return nil, fmt.Errorf("failed to read ConfigMap %s/%s: %w", cmNamespace, cmName, err) + } + + return cm.Data, nil +} + +func findModelSLO(cmData map[string]string, targetModel string) (*ServiceClassEntry, string /* class name */, error) { + for key, val := range cmData { + var sc ServiceClass + if err := yaml.Unmarshal([]byte(val), &sc); err != nil { + return nil, "", fmt.Errorf("failed to parse %s: %w", key, err) + } + + for _, entry := range sc.Data { + if entry.Model == targetModel { + return &entry, sc.Name, nil + } + } + } + return nil, "", fmt.Errorf("model %q not found in any service class", targetModel) +} + +func ptr[T any](v T) *T { + return &v +} diff --git a/internal/controller/variantautoscaling_controller_test.go b/internal/controller/variantautoscaling_controller_test.go new file mode 100644 index 000000000..2a4a1a2f1 --- /dev/null +++ b/internal/controller/variantautoscaling_controller_test.go @@ -0,0 +1,82 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + llmdVariantAutoscalingV1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1" +) + +var _ = Describe("VariantAutoscalings Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-resource" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + Namespace: "default", + } + VariantAutoscalings := &llmdVariantAutoscalingV1alpha1.VariantAutoscaling{} + + BeforeEach(func() { + By("creating the custom resource for the Kind VariantAutoscalings") + err := k8sClient.Get(ctx, typeNamespacedName, VariantAutoscalings) + if err != nil && errors.IsNotFound(err) { + resource := &llmdVariantAutoscalingV1alpha1.VariantAutoscaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + Namespace: "default", + }, + // TODO(user): Specify other spec details if needed. + } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + // TODO(user): Cleanup logic after each test, like removing the resource instance. 
+			resource := &llmdVariantAutoscalingV1alpha1.VariantAutoscaling{}
+			err := k8sClient.Get(ctx, typeNamespacedName, resource)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Cleaning up the specific resource instance VariantAutoscalings")
+			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
+		})
+		It("should successfully reconcile the resource", func() {
+			By("Reconciling the created resource")
+			controllerReconciler := &VariantAutoscalingReconciler{
+				Client: k8sClient,
+				Scheme: k8sClient.Scheme(),
+			}
+
+			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+		})
+	})
+})
diff --git a/internal/interfaces/interfaces.go b/internal/interfaces/interfaces.go
new file mode 100644
index 000000000..3792ed227
--- /dev/null
+++ b/internal/interfaces/interfaces.go
@@ -0,0 +1,42 @@
+package controller
+
+import (
+	"context"
+
+	llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+)
+
+// VariantAutoscalingsEngine defines the interface for the optimization engine.
+type VariantAutoscalingsEngine interface {
+	Optimize(
+		ctx context.Context,
+		va llmdOptv1alpha1.VariantAutoscaling,
+		analysis ModelAnalyzeResponse,
+		metrics MetricsSnapshot,
+	) (llmdOptv1alpha1.OptimizedAlloc, error)
+}
+
+// ModelAnalyzer defines the interface for model analysis.
+type ModelAnalyzer interface {
+	AnalyzeModel(
+		ctx context.Context,
+		va llmdOptv1alpha1.VariantAutoscaling,
+		metrics MetricsSnapshot,
+	) (*ModelAnalyzeResponse, error)
+}
+
+// Actuator applies desired allocations to workloads and publishes metrics about them.
+type Actuator interface {
+	// ApplyReplicaTargets mutates workloads (e.g., Deployments, InferenceServices) to match target replicas.
+	// To be deprecated
+	ApplyReplicaTargets(
+		ctx context.Context,
+		VariantAutoscalings *llmdOptv1alpha1.VariantAutoscaling,
+	) error
+
+	// EmitMetrics publishes metrics about the target state (e.g., desired replicas, reasons).
+	EmitMetrics(
+		ctx context.Context,
+		VariantAutoscalings *llmdOptv1alpha1.VariantAutoscaling,
+	) error
+}
diff --git a/internal/interfaces/types.go b/internal/interfaces/types.go
new file mode 100644
index 000000000..acbd0530d
--- /dev/null
+++ b/internal/interfaces/types.go
@@ -0,0 +1,13 @@
+package controller
+
+// ModelAnalyzeResponse carries the per-phase demand derived by a ModelAnalyzer.
+type ModelAnalyzeResponse struct {
+	RequiredPrefillQPS float64
+	RequiredDecodeQPS  float64
+	Reason             string
+}
+
+// MetricsSnapshot is the observed load handed to analyzers and engines.
+type MetricsSnapshot struct {
+	ActualQPS float64
+}
diff --git a/internal/modelanalyzer/README.md b/internal/modelanalyzer/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/modelanalyzer/dummy_analyzer.go b/internal/modelanalyzer/dummy_analyzer.go
new file mode 100644
index 000000000..4f3f62d6b
--- /dev/null
+++ b/internal/modelanalyzer/dummy_analyzer.go
@@ -0,0 +1,49 @@
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+	interfaces "github.com/llm-d-incubation/inferno-autoscaler/internal/interfaces"
+)
+
+// SimplePrefillDecodeAnalyzer splits observed demand into prefill and decode
+// QPS using a fixed ratio.
+type SimplePrefillDecodeAnalyzer struct{}
+
+// NewSimplePrefillDecodeAnalyzer returns a new fixed-ratio analyzer.
+func NewSimplePrefillDecodeAnalyzer() *SimplePrefillDecodeAnalyzer {
+	return &SimplePrefillDecodeAnalyzer{}
+}
+
+// AnalyzeModel calculates required prefill/decode QPS from ActualQPS.
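+// A minimal usage sketch (hypothetical values; va is any VariantAutoscaling):
+//
+//	analyzer := NewSimplePrefillDecodeAnalyzer()
+//	resp, _ := analyzer.AnalyzeModel(ctx, va, interfaces.MetricsSnapshot{ActualQPS: 120})
+//	// resp.RequiredPrefillQPS == 48 (40%), resp.RequiredDecodeQPS == 72 (60%)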
+func (a *SimplePrefillDecodeAnalyzer) AnalyzeModel(
+	ctx context.Context,
+	spec llmdOptv1alpha1.VariantAutoscaling,
+	metrics interfaces.MetricsSnapshot,
+) (*interfaces.ModelAnalyzeResponse, error) {
+	// dummy traffic shape: 40% prefill, 60% decode
+	prefillRatio := 0.4
+	decodeRatio := 0.6
+
+	requiredPrefill := metrics.ActualQPS * prefillRatio
+	requiredDecode := metrics.ActualQPS * decodeRatio
+
+	reason := fmt.Sprintf(
+		"Split ActualQPS %.2f into prefill %.2f and decode %.2f (fixed ratio %.0f/%.0f)",
+		metrics.ActualQPS, requiredPrefill, requiredDecode,
+		prefillRatio*100, decodeRatio*100,
+	)
+
+	return &interfaces.ModelAnalyzeResponse{
+		RequiredPrefillQPS: requiredPrefill,
+		RequiredDecodeQPS:  requiredDecode,
+		Reason:             reason,
+	}, nil
+}
diff --git a/internal/optimizer/README.md b/internal/optimizer/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/optimizer/dummy_optimizer.go b/internal/optimizer/dummy_optimizer.go
new file mode 100644
index 000000000..810d42143
--- /dev/null
+++ b/internal/optimizer/dummy_optimizer.go
@@ -0,0 +1,48 @@
+package controller
+
+import (
+	"context"
+	"math"
+	"time"
+
+	llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
+	interfaces "github.com/llm-d-incubation/inferno-autoscaler/internal/interfaces"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// DummyVariantAutoscalingsEngine sizes replicas from fixed per-replica capacities.
+type DummyVariantAutoscalingsEngine struct{}
+
+func NewDummyVariantAutoscalingsEngine() *DummyVariantAutoscalingsEngine {
+	return &DummyVariantAutoscalingsEngine{}
+}
+
+// Optimize implements dummy logic to produce one OptimizedAlloc in status.
+func (e *DummyVariantAutoscalingsEngine) Optimize(
+	ctx context.Context,
+	va llmdOptv1alpha1.VariantAutoscaling,
+	analysis interfaces.ModelAnalyzeResponse,
+	metrics interfaces.MetricsSnapshot,
+) (llmdOptv1alpha1.OptimizedAlloc, error) {
+
+	totalPrefillQPS := analysis.RequiredPrefillQPS
+	totalDecodeQPS := analysis.RequiredDecodeQPS
+
+	// Dummy per-replica capacities
+	perReplicaPrefill := 100.0
+	perReplicaDecode := 300.0
+
+	// Determine required replicas as the max of the two phase requirements.
+	// Example: prefill 192/100 -> 2 replicas, decode 288/300 -> 1 replica, target 2.
+	replicasPrefill := math.Ceil(totalPrefillQPS / perReplicaPrefill)
+	replicasDecode := math.Ceil(totalDecodeQPS / perReplicaDecode)
+	replicaTarget := int(math.Max(replicasPrefill, replicasDecode))
+
+	alloc := llmdOptv1alpha1.OptimizedAlloc{
+		LastRunTime: metav1.NewTime(time.Now()),
+		Accelerator: "A100", // or read from VariantAutoscalings spec / label if available
+		NumReplicas: replicaTarget,
+	}
+
+	return alloc, nil
+}
diff --git a/manifests/yamls/deploy-optimizer.yaml b/manifests/yamls/deploy-optimizer.yaml
deleted file mode 100644
index 7a9089abe..000000000
--- a/manifests/yamls/deploy-optimizer.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: inferno
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: inferno-optimizer
-  namespace: inferno
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: inferno-optimizer
-  template:
-    metadata:
-      labels:
-        app: inferno-optimizer
-    spec:
-      containers:
-      - name: optimizer
-        image: quay.io/atantawi/inferno-optimizer:latest
-        imagePullPolicy: IfNotPresent
-        env:
-        - name: INFERNO_PORT
-          value: "3302"
-        ports:
-        - containerPort: 3302
-        command: ["optimizer"]
-        args: ["-F"]
-        resources:
-          requests:
-            memory: "512Mi"
-            cpu: "500m"
-          limits:
-            memory: "2Gi"
-            cpu: "1"
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: inferno-optimizer
-  namespace: inferno
-spec:
-  selector:
-    app: 
inferno-optimizer - ports: - - protocol: TCP - port: 80 - targetPort: 3302 diff --git a/metrics.yaml b/metrics.yaml new file mode 100644 index 000000000..0d479d508 --- /dev/null +++ b/metrics.yaml @@ -0,0 +1,289 @@ +{ + "metrics": { + "vllm-opt125m-deployment-vllm-57cc4cddb8-xjchm": [ + { + "Name": "vllm:e2e_request_latency_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.848804 + }, + { + "Name": "vllm:request_params_n_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.848498 + }, + { + "Name": "vllm:gpu_prefix_cache_hits_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.847717 + }, + { + "Name": "vllm:request_generation_tokens_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8480692 + }, + { + "Name": "vllm:request_max_num_generation_tokens_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.848424 + }, + { + "Name": "vllm:request_inference_time_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8489597 + }, + { + "Name": "vllm:gpu_prefix_cache_queries_total", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:iteration_tokens_total_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8481336 + }, + { + "Name": "vllm:generation_tokens_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8477938 + }, + { + "Name": "vllm:request_success_total", + "Labels": { + "engine": "0", + "finished_reason": "stop", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:request_success_total", + "Labels": { + "engine": "0", + "finished_reason": "length", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:request_success_total", + "Labels": { + "engine": "0", + "finished_reason": "abort", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:request_decode_time_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8491101 + }, + { + "Name": "vllm:gpu_prefix_cache_hits_total", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:num_preemptions_total", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:request_prefill_time_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8490357 + }, + { + "Name": "vllm:num_requests_waiting", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:gpu_cache_usage_perc", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:time_to_first_token_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.848614 + }, + { + "Name": "vllm:time_per_output_token_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8487253 + }, + { + "Name": "vllm:prompt_tokens_total", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + 
"Name": "vllm:prompt_tokens_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.847768 + }, + { + "Name": "vllm:cache_config_info", + "Labels": { + "block_size": "16", + "cache_dtype": "auto", + "calculate_kv_scales": "False", + "cpu_offload_gb": "0.0", + "enable_prefix_caching": "True", + "engine": "0", + "gpu_memory_utilization": "0.9", + "is_attention_free": "False", + "num_cpu_blocks": "None", + "num_gpu_blocks": "63593", + "num_gpu_blocks_override": "None", + "prefix_caching_hash_algo": "builtin", + "sliding_window": "None", + "swap_space": "4.0", + "swap_space_bytes": "4294967296.0" + }, + "Value": 1 + }, + { + "Name": "vllm:num_requests_running", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:request_queue_time_seconds_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8488834 + }, + { + "Name": "vllm:request_params_max_tokens_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8485537 + }, + { + "Name": "vllm:request_prompt_tokens_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.847957 + }, + { + "Name": "vllm:num_preemptions_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8477442 + }, + { + "Name": "vllm:request_success_created", + "Labels": { + "engine": "0", + "finished_reason": "stop", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8478458 + }, + { + "Name": "vllm:request_success_created", + "Labels": { + "engine": "0", + "finished_reason": "length", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8478625 + }, + { + "Name": "vllm:request_success_created", + "Labels": { + "engine": "0", + "finished_reason": "abort", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8478768 + }, + { + "Name": "vllm:generation_tokens_total", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 0 + }, + { + "Name": "vllm:gpu_prefix_cache_queries_created", + "Labels": { + "engine": "0", + "model_name": "facebook/opt-125m" + }, + "Value": 1751829590.8476853 + } + ] + } +} \ No newline at end of file diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go deleted file mode 100644 index abfe2706d..000000000 --- a/pkg/config/defaults.go +++ /dev/null @@ -1,34 +0,0 @@ -package config - -import "math" - -/** - * Parameters - */ - -// Tolerated percentile for SLOs -var SLOPercentile = 0.95 - -// Multiplier of average of exponential distribution to attain percentile -var SLOMargin = -float32(math.Log(1 - SLOPercentile)) - -// small disturbance around a value -var Delta = float32(0.001) - -// maximum number of requests in queueing system as multiples of maximum batch size -var MaxQueueToBatchRatio = 10 - -// accelerator transition penalty factor -var AccelPenaltyFactor = float32(0.1) - -// default name of a service class -const DefaultServiceClassName string = "Free" - -// default priority of a service class -const DefaultServiceClassPriority int = 0 - -// weight factor for class priority used in greedy limited solver -var PriorityWeightFactor float32 = 1.0 - -// fraction of maximum server throughput to provide stability (running this fraction below the maximum) -var StabilitySafetyFraction float32 = 0.1 diff --git a/pkg/config/types.go b/pkg/config/types.go deleted file mode 100644 index 
b59842813..000000000 --- a/pkg/config/types.go +++ /dev/null @@ -1,137 +0,0 @@ -package config - -// TODO: add json validation and default values - -// All data related to the system (accelerators, models, service classes, ...) -type SystemData struct { - Spec SystemSpec `json:"system"` -} - -// Specifications for system data -type SystemSpec struct { - // static data - Accelerators AcceleratorData `json:"acceleratorData"` // accelerator data - Models ModelData `json:"modelData"` // model data - ServiceClasses ServiceClassData `json:"serviceClassData"` // service class data - Servers ServerData `json:"serverData"` // server data - Optimizer OptimizerData `json:"optimizerData"` // optimizer data - - // dynamic data - Capacity CapacityData `json:"capacityData"` // data about accelerator type availability -} - -// Data related to an Accelerator -type AcceleratorData struct { - Spec []AcceleratorSpec `json:"accelerators"` // accelerator specs -} - -// Specifications for accelerator data -type AcceleratorSpec struct { - Name string `json:"name"` // name of accelerator - Type string `json:"type"` // name of accelerator type (e.g. A100) - Multiplicity int `json:"multiplicity"` // number of cards of type for this accelerator - MemSize int `json:"memSize"` // GB - MemBW int `json:"memBW"` // GB/sec - Power PowerSpec `json:"power"` // power consumption specs - Cost float32 `json:"cost"` // cents/hr -} - -// Specifications for Accelerator power consumption data (Watts) -type PowerSpec struct { - Idle int `json:"idle"` // idle power - Full int `json:"full"` // full utilization power - MidPower int `json:"midPower"` // power at inflection point - MidUtil float32 `json:"midUtil"` // utilization at inflection point -} - -// Data about accelerator type availability -type CapacityData struct { - Count []AcceleratorCount `json:"count"` // count of accelerator types -} - -// Count of accelerator types in the system -type AcceleratorCount struct { - Type string `json:"type"` // name of accelerator type - Count int `json:"count"` // number of available units -} - -// Data related to a Model -type ModelData struct { - PerfData []ModelAcceleratorPerfData `json:"models"` // performance data for model on accelerators -} - -// Specifications for a combination of a model and accelerator data -type ModelAcceleratorPerfData struct { - Name string `json:"name"` // model name - Acc string `json:"acc"` // accelerator name - AccCount int `json:"accCount"` // number of accelerator units used by model - Alpha float32 `json:"alpha"` // alpha parameter of ITL - Beta float32 `json:"beta"` // beta parameter of ITL - MaxBatchSize int `json:"maxBatchSize"` // max batch size based on average number of tokens per request - AtTokens int `json:"atTokens"` // average number of tokens per request assumed in max batch size calculation -} - -// Data related to a service class SLOs -type ServiceClassData struct { - Spec []ServiceClassSpec `json:"serviceClasses"` -} - -// Specifications of SLO data for a combination of a service class and a model -type ServiceClassSpec struct { - Name string `json:"name"` // service class name - Model string `json:"model"` // model name - Priority int `json:"priority"` // (non-negative) priority (lower value is higher priority) - SLO_ITL float32 `json:"slo-itl"` // inter-token latency (msec) - SLO_TTW float32 `json:"slo-ttw"` // request waiting time (msec) - SLO_TPS float32 `json:"slo-tps"` // throughput (tokens/sec) -} - -// Data related to a Server -type ServerData struct { - Spec []ServerSpec 
`json:"servers"` -} - -// Specifications of a server -type ServerSpec struct { - Name string `json:"name"` // server name - Class string `json:"class"` // service class name - Model string `json:"model"` // model name - CurrentAlloc AllocationData `json:"currentAlloc"` // current allocation - DesiredAlloc AllocationData `json:"desiredAlloc"` // desired allocation -} - -// Specifications of server load statistics -type ServerLoadSpec struct { - ArrivalRate float32 `json:"arrivalRate"` // req/min - AvgLength int `json:"avgLength"` // number of tokens - ArrivalCOV float32 `json:"arrivalCOV"` // coefficient of variation of inter-request arrival time - ServiceCOV float32 `json:"serviceCOV"` // coefficient of variation of request service time -} - -// Data related to Optimizer -type OptimizerData struct { - Spec OptimizerSpec `json:"optimizer"` -} - -// Specifications for optimizer data -type OptimizerSpec struct { - Unlimited bool `json:"unlimited"` // unlimited number of accelerator types (for capacity planning and/or cloud) - Heterogeneous bool `json:"heterogeneous"` // heterogeneous accelerators assigned to same inference server - MILPSolver bool `json:"milpSolver"` // use MILP solver to optimize - UseCplex bool `json:"useCplex"` // use CPLEX solver for MILP problem -} - -type AllocationSolution struct { - Spec map[string]AllocationData `json:"allocations"` // map of server names to allocation data -} - -// Data about a server allocation -type AllocationData struct { - Accelerator string `json:"accelerator"` // accelerator name - NumReplicas int `json:"numReplicas"` // number of replicas - MaxBatch int `json:"maxBatch"` // max batch size - Cost float32 `json:"cost"` // cost of allocation - ITLAverage float32 `json:"itlAverage"` // average ITL - WaitAverage float32 `json:"waitAverage"` // average wait time - Load ServerLoadSpec `json:"load"` // server load statistics -} diff --git a/pkg/core/accelerator.go b/pkg/core/accelerator.go deleted file mode 100644 index 306b3c668..000000000 --- a/pkg/core/accelerator.go +++ /dev/null @@ -1,71 +0,0 @@ -package core - -import ( - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" -) - -// An accelerator used in an inference server -// - full or multiple GPU units (cards) -type Accelerator struct { - name string - spec *config.AcceleratorSpec - - // power profile slope at low utilization - slopeLow float32 - // power profile slope at high utilization - slopeHigh float32 -} - -func NewAcceleratorFromSpec(spec *config.AcceleratorSpec) *Accelerator { - return &Accelerator{ - name: spec.Name, - spec: spec, - } -} - -// Calculate basic parameters -func (g *Accelerator) Calculate() { - g.slopeLow = float32(g.spec.Power.MidPower-g.spec.Power.Idle) / g.spec.Power.MidUtil - g.slopeHigh = float32(g.spec.Power.Full-g.spec.Power.MidPower) / (1 - g.spec.Power.MidUtil) -} - -// Evaluate power consumption at a given utilization -func (g *Accelerator) Power(util float32) float32 { - if util <= g.spec.Power.MidUtil { - return float32(g.spec.Power.Idle) + g.slopeLow*util - } else { - return float32(g.spec.Power.MidPower) + g.slopeHigh*(util-g.spec.Power.MidUtil) - } -} - -func (g *Accelerator) Name() string { - return g.name -} - -func (g *Accelerator) Spec() *config.AcceleratorSpec { - return g.spec -} - -func (g *Accelerator) Type() string { - return g.spec.Type -} - -func (g *Accelerator) Cost() float32 { - return g.spec.Cost -} - -func (g *Accelerator) Multiplicity() int { - return g.spec.Multiplicity -} - -func (g *Accelerator) MemSize() int { - return 
g.spec.MemSize -} - -func (g *Accelerator) String() string { - return fmt.Sprintf("Accelerator: name=%s; type=%s; multiplicity=%d; memSize=%d; memBW=%d; cost=%v; power={ %d, %d, %d @ %v }", - g.name, g.spec.Type, g.spec.Multiplicity, g.spec.MemSize, g.spec.MemBW, g.spec.Cost, - g.spec.Power.Idle, g.spec.Power.Full, g.spec.Power.MidPower, g.spec.Power.MidUtil) -} diff --git a/pkg/core/allocation.go b/pkg/core/allocation.go deleted file mode 100644 index 0cacf3542..000000000 --- a/pkg/core/allocation.go +++ /dev/null @@ -1,436 +0,0 @@ -package core - -import ( - "bytes" - "fmt" - "math" - - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/queue-analysis/pkg/queue" - "github.com/llm-inferno/queue-analysis/pkg/utils" -) - -// Allocation details of an accelerator to a server -type Allocation struct { - accelerator string // name of accelerator - numReplicas int // number of server replicas - batchSize int // max batch size - cost float32 // cost of this allocation - value float32 // value of this allocation - servTime float32 // expected average token service time - waitTime float32 // expected average request queueing time - rho float32 // expected busy server defined as (1 - probability of at least one request running) - - maxArrvRatePerReplica float32 // maximum arrival rate per replica -} - -// queueing model used in performance analysis -var queueModel *queue.MM1ModelStateDependent - -// Create an allocation of an accelerator to a server; nil if not feasible -func CreateAllocation(serverName string, gName string) *Allocation { - var ( - acc *Accelerator - - server *Server - load *config.ServerLoadSpec - - model *Model - perf *config.ModelAcceleratorPerfData - - svc *ServiceClass - target *Target - ) - - // get accelerator info - if acc = GetAccelerator(gName); acc == nil { - return nil - } - - // get server info - if server = GetServer(serverName); server == nil { - return nil - } - if load = server.Load(); load == nil || load.ArrivalRate <= 0 || load.AvgLength <= 0 { - return nil - } - - // get model info - modelName := server.ModelName() - if model = GetModel(modelName); model == nil { - return nil - } - if perf = model.PerfData(gName); perf == nil { - return nil - } - - // get service class info - if svc = GetServiceClass(server.ServiceClassName()); svc == nil { - return nil - } - if target = svc.ModelTarget(modelName); target == nil { - return nil - } - - // calculate max batch size (N) based on average request length (K) - K := load.AvgLength - N := max(perf.MaxBatchSize*perf.AtTokens/K, 1) - maxQueue := N * config.MaxQueueToBatchRatio - - // distribution of token time assumed deterministic - servTimeLimit := float32(K) * target.ITL - // distribution of waiting time assumed exponential - waitTimeLimit := target.TTW / config.SLOMargin - // desired throughput (requests/msec) - throughputLimit := target.TPS / (1000 * float32(K)) - - // calculate state-dependent service rate for queueuing model - servRate := make([]float32, N) - for n := 1; n <= N; n++ { - servTime := perf.Alpha + perf.Beta*float32(n) - servRate[n-1] = float32(n) / (servTime * float32(K)) - } - - // analyze queueuing model - queueModel = queue.NewMM1ModelStateDependent(maxQueue, servRate) - lambdaMin := servRate[0] * config.Delta - lambdaMax := servRate[N-1] * (1 - config.Delta) - - // determine rate at which the average service time is below the service time limit - lambdaStarService := lambdaMax - if target.ITL > 0 { - lambda, ind, err := utils.BinarySearch(lambdaMin, lambdaMax, servTimeLimit, 
EvalServTime) - if err != nil { - fmt.Println(err.Error()) - return nil - } - if ind < 0 { - return nil // unattainable service time limit - } - lambdaStarService = lambda - } - - // determine rate at which the average waiting time is below to the waiting time limit - lambdaStarWait := lambdaMax - if target.TTW > 0 { - lambda, ind, err := utils.BinarySearch(lambdaMin, lambdaMax, waitTimeLimit, EvalWaitingTime) - if err != nil { - fmt.Println(err.Error()) - return nil - } - if ind < 0 { - return nil // unattainable waiting time limit - } - lambdaStarWait = lambda - } - - // determine rate for max throughput - lambdaStarThroughput := lambdaMax - if target.TPS > 0 { - lambdaStarThroughput = lambdaMax * (1 - config.StabilitySafetyFraction) - } - - // arrival rate satisfying all SLOs - lambdaStar := float32(math.Min(float64(lambdaStarService), float64(lambdaStarWait))) - lambdaStar = float32(math.Min(float64(lambdaStar), float64(lambdaStarThroughput))) - - // calculate number of replicas - var totalLambda float32 - if target.TPS == 0 { - totalLambda = load.ArrivalRate / 60 / 1000 - } else { - totalLambda = throughputLimit - } - numReplicas := int(math.Ceil(float64(totalLambda) / float64(lambdaStar))) - - // calculate cost - totalNumInstances := model.NumInstances(gName) * numReplicas - cost := acc.Cost() * float32(totalNumInstances) - - // queueModel.Solve(lambdaStar, 1) - // fmt.Printf("model=%s; accelerator=%s; lambdaMin=%v; lambdaMax=%v; servTimeLimit= %v; waitTimeLimit=%v; lambdaStarService=%v; lambdaStarWait=%v; lambdaStarThroughput= %v, lambdaStar=%v \n", - // model.Name(), gName, - // lambdaMin, lambdaMax, servTimeLimit, waitTimeLimit, lambdaStarService, lambdaStarWait, lambdaStarThroughput, lambdaStar) - // fmt.Println(queueModel) - - // calculate queue statistics - lambda := totalLambda / float32(numReplicas) - queueModel.Solve(lambda, 1) - rho := queueModel.GetRho() - servTime := queueModel.GetAvgServTime() / float32(K) - wait := queueModel.GetAvgWaitTime() - // fmt.Printf("numReplicas=%d; batchSize=%d; lambda=%v, tokenTime=%v; wait=%v; \n", numReplicas, N, lambda, servTime, wait) - - alloc := &Allocation{accelerator: gName, numReplicas: numReplicas, batchSize: N, - cost: cost, servTime: servTime, waitTime: wait, rho: rho, maxArrvRatePerReplica: lambdaStar} - alloc.SetValue(alloc.cost) - return alloc -} - -func EvalWaitingTime(x float32) (float32, error) { - queueModel.Solve(x, 1) - if !queueModel.IsValid() { - return 0, fmt.Errorf("invalid model %v", queueModel) - } - return queueModel.GetAvgWaitTime(), nil -} - -func EvalServTime(x float32) (float32, error) { - queueModel.Solve(x, 1) - if !queueModel.IsValid() { - return 0, fmt.Errorf("invalid model %v", queueModel) - } - return queueModel.GetAvgServTime(), nil -} - -// Create an allocation for an accelerator to a server; nil if not feasible -// (using G/G/m model approximation) -func CreateAllocationUsingGGm(serverName string, gName string) *Allocation { - var ( - acc *Accelerator - - server *Server - load *config.ServerLoadSpec - - model *Model - perf *config.ModelAcceleratorPerfData - - svc *ServiceClass - target *Target - ) - - // get accelerator info - if acc = GetAccelerator(gName); acc == nil { - return nil - } - - // get server info - if server = GetServer(serverName); server == nil { - return nil - } - if load = server.Load(); load == nil { - return nil - } - - // get model info - modelName := server.ModelName() - if model = GetModel(modelName); model == nil { - return nil - } - if perf = model.PerfData(gName); perf == nil 
{ - return nil - } - - // get service class info - if svc = GetServiceClass(server.ServiceClassName()); svc == nil { - return nil - } - if target = svc.ModelTarget(modelName); target == nil { - return nil - } - - K := load.AvgLength - N := max(perf.MaxBatchSize*perf.AtTokens/K, 1) - - servTime := perf.Alpha + perf.Beta*float32(N) - if target.ITL > 0 && servTime > target.ITL { - return nil - } - - numReplicas := 0 - gamma := ((load.ArrivalCOV * load.ArrivalCOV) + (load.ServiceCOV * load.ServiceCOV)) / 2 - if target.ITL > 0 && target.TTW > 0 { - waitTimeLimit := target.TTW / config.SLOMargin - xStar := float32(perf.MaxBatchSize) * waitTimeLimit / (float32(K) * servTime * gamma) - rhoStar := xStar / (1 + xStar) - lambdaStar := rhoStar / (float32(K) * servTime) - numReplicas = int(math.Ceil(float64(load.ArrivalRate) / (float64(lambdaStar) * 60 * 1000))) - } - if target.TPS > 0 { - lambdaMax := float32(N) / (servTime * float32(K)) - lambdaStarThroughput := lambdaMax * (1 - config.StabilitySafetyFraction) - throughputTarget := target.TPS / (1000 * float32(K)) - n := int(math.Ceil(float64(throughputTarget) / float64(lambdaStarThroughput))) - numReplicas = max(numReplicas, n) - } - if numReplicas == 0 { - return nil - } - - // calculate cost - totalNumInstances := model.NumInstances(gName) * numReplicas - cost := acc.Cost() * float32(totalNumInstances) - - rho := load.ArrivalRate * float32(K) * servTime / (float32(numReplicas) * 60 * 1000) - x := rho / (1 - rho) - wait := (float32(K) * servTime) * gamma * x / float32(perf.MaxBatchSize) - - alloc := &Allocation{accelerator: gName, numReplicas: numReplicas, batchSize: N, - cost: cost, servTime: servTime, waitTime: wait, rho: rho} - alloc.SetValue(alloc.cost) - return alloc -} - -func (a *Allocation) Scale(serverName string) (alloc *Allocation, inc int) { - var ( - acc *Accelerator - server *Server - load *config.ServerLoadSpec - ) - - // get server info - if server = GetServer(serverName); server == nil { - return nil, 0 - } - if load = server.Load(); load == nil { - return nil, 0 - } - - // get accelerator info - gName := a.accelerator - if acc = GetAccelerator(gName); acc == nil { - return nil, 0 - } - - // create new allocation - alloc = CreateAllocation(serverName, gName) - inc = alloc.numReplicas - a.numReplicas - return alloc, inc -} - -func (a *Allocation) ReAllocate(serverName string) (*Allocation, string) { - minVal := float32(0) - var minAlloc *Allocation - for gName := range GetAccelerators() { - if alloc := CreateAllocation(serverName, gName); alloc != nil { - if minVal == 0 || alloc.value < minVal { - minVal = alloc.value - minAlloc = alloc - } - } - } - if minAlloc == nil { - return nil, "" - } - return minAlloc, minAlloc.accelerator -} - -func (a *Allocation) Accelerator() string { - return a.accelerator -} - -func (a *Allocation) NumReplicas() int { - return a.numReplicas -} - -func (a *Allocation) MaxArrvRatePerReplica() float32 { - return a.maxArrvRatePerReplica -} - -// Set the value for this allocation (may depend on cost, performance, ...) 
-func (a *Allocation) SetValue(value float32) { - a.value = value -} - -func (a *Allocation) Value() float32 { - return a.value -} - -// Calculate penalty for transitioning from this allocation (a) to another allocation (b) -func (a *Allocation) TransitionPenalty(b *Allocation) float32 { - if a.accelerator == b.accelerator { - if a.numReplicas == b.numReplicas { - return 0 - } else { - return b.cost - a.cost - } - } - return config.AccelPenaltyFactor*(a.cost+b.cost) + (b.cost - a.cost) -} - -func (a *Allocation) Clone() *Allocation { - return &Allocation{ - accelerator: a.accelerator, - numReplicas: a.numReplicas, - batchSize: a.batchSize, - cost: a.cost, - value: a.value, - servTime: a.servTime, - waitTime: a.waitTime, - rho: a.rho, - - maxArrvRatePerReplica: a.maxArrvRatePerReplica, - } -} - -func (a *Allocation) AllocationData() *config.AllocationData { - return &config.AllocationData{ - Accelerator: a.accelerator, - NumReplicas: a.numReplicas, - MaxBatch: a.batchSize, - Cost: a.cost, - ITLAverage: a.servTime, - WaitAverage: a.waitTime, - } -} - -func AllocationFromData(data *config.AllocationData) *Allocation { - return &Allocation{ - accelerator: data.Accelerator, - numReplicas: data.NumReplicas, - batchSize: data.MaxBatch, - cost: data.Cost, - servTime: data.ITLAverage, - waitTime: data.WaitAverage, - } -} - -func (a *Allocation) String() string { - return fmt.Sprintf("{acc=%s; num=%d; maxBatch=%d; cost=%v, val=%v, servTime=%v, waitTime=%v, rho=%v}", - a.accelerator, a.numReplicas, a.batchSize, a.cost, a.value, a.servTime, a.waitTime, a.rho) -} - -// Orchestration difference between two allocations -type AllocationDiff struct { - oldAccelerator string - newAccelerator string - oldNumReplicas int - newNumReplicas int - costDiff float32 -} - -func CreateAllocationDiff(a *Allocation, b *Allocation) *AllocationDiff { - if a == nil && b == nil { - return nil - } - oldAccelerator := "none" - newAccelerator := "none" - oldNumReplicas := 0 - newNumReplicas := 0 - oldCost := float32(0) - newCost := float32(0) - if a != nil { - oldAccelerator = a.accelerator - oldNumReplicas = a.numReplicas - oldCost = a.cost - } - if b != nil { - newAccelerator = b.accelerator - newNumReplicas = b.numReplicas - newCost = b.cost - } - return &AllocationDiff{ - oldAccelerator: oldAccelerator, - newAccelerator: newAccelerator, - oldNumReplicas: oldNumReplicas, - newNumReplicas: newNumReplicas, - costDiff: newCost - oldCost, - } -} - -func (d *AllocationDiff) String() string { - var b bytes.Buffer - fmt.Fprintf(&b, "{ %s -> %s, %d -> %d, %v }", - d.oldAccelerator, d.newAccelerator, d.oldNumReplicas, d.newNumReplicas, d.costDiff) - return b.String() -} diff --git a/pkg/core/model.go b/pkg/core/model.go deleted file mode 100644 index dc7670bd0..000000000 --- a/pkg/core/model.go +++ /dev/null @@ -1,75 +0,0 @@ -package core - -import ( - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" -) - -// An inference model -type Model struct { - name string - - // model performance data for specified accelerators - perfData map[string]*config.ModelAcceleratorPerfData - - // number of accelerator instances needed to fit a model on a given accelerator - numInstances map[string]int -} - -func NewModel(name string) *Model { - return &Model{ - name: name, - perfData: make(map[string]*config.ModelAcceleratorPerfData), - numInstances: make(map[string]int), - } -} - -// Calculate basic parameters -func (m *Model) Calculate(accelerators map[string]*Accelerator) { - // add any operations here -} - -func (m *Model) Name() string { - 
return m.name -} - -func (m *Model) NumInstances(acceleratorName string) int { - return m.numInstances[acceleratorName] -} - -func (m *Model) PerfData(acceleratorName string) *config.ModelAcceleratorPerfData { - return m.perfData[acceleratorName] -} - -func (m *Model) AddPerfDataFromSpec(spec *config.ModelAcceleratorPerfData) { - if spec.Name == m.name { - m.perfData[spec.Acc] = spec - var count int - if count = spec.AccCount; count <= 0 { - count = 1 - } - m.numInstances[spec.Acc] = count - } -} - -func (m *Model) RemovePerfData(accName string) { - delete(m.perfData, accName) -} - -func (m *Model) Spec() *config.ModelData { - md := &config.ModelData{ - PerfData: make([]config.ModelAcceleratorPerfData, len(m.perfData)), - } - i := 0 - for _, pd := range m.perfData { - md.PerfData[i] = *pd - i++ - } - return md -} - -func (m *Model) String() string { - return fmt.Sprintf("Model: name=%s; numInstances=%v", - m.name, m.numInstances) -} diff --git a/pkg/core/server.go b/pkg/core/server.go deleted file mode 100644 index b06b97c8f..000000000 --- a/pkg/core/server.go +++ /dev/null @@ -1,135 +0,0 @@ -package core - -import ( - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" -) - -// A server for a service class and model -type Server struct { - name string - serviceClassName string - modelName string - - // server load statistics - load *config.ServerLoadSpec - - // for all accelerators - allAllocations map[string]*Allocation - - // allocated solution - allocation *Allocation - - // current allocation - curAllocation *Allocation - - spec *config.ServerSpec -} - -func NewServerFromSpec(spec *config.ServerSpec) *Server { - ld := spec.CurrentAlloc.Load - svcName := spec.Class - if svcName == "" { - svcName = config.DefaultServiceClassName - } - return &Server{ - name: spec.Name, - serviceClassName: svcName, - modelName: spec.Model, - load: &ld, - allAllocations: map[string]*Allocation{}, - curAllocation: AllocationFromData(&spec.CurrentAlloc), - spec: spec, - } -} - -// Calculate allocations for all accelerators -func (s *Server) Calculate(accelerators map[string]*Accelerator) { - s.allAllocations = make(map[string]*Allocation) - for _, g := range accelerators { - if alloc := CreateAllocation(s.name, g.Name()); alloc != nil { - if s.curAllocation != nil { - penalty := s.curAllocation.TransitionPenalty(alloc) - alloc.SetValue(penalty) - } - s.allAllocations[g.Name()] = alloc - } - } -} - -func (s *Server) Name() string { - return s.name -} - -func (s *Server) ServiceClassName() string { - return s.serviceClassName -} - -func (s *Server) Priority() int { - if svc := GetServiceClass(s.serviceClassName); svc != nil { - return svc.Priority() - } - return config.DefaultServiceClassPriority -} - -func (s *Server) ModelName() string { - return s.modelName -} - -func (s *Server) Load() *config.ServerLoadSpec { - return s.load -} - -func (s *Server) SetLoad(load *config.ServerLoadSpec) { - s.load = load -} - -func (s *Server) Allocation() *Allocation { - return s.allocation -} - -func (s *Server) SetAllocation(alloc *Allocation) { - s.allocation = alloc - s.UpdateDesiredAlloc() -} - -func (s *Server) RemoveAllocation() { - s.allocation = nil -} - -func (s *Server) CurAllocation() *Allocation { - return s.curAllocation -} - -func (s *Server) SetCurAllocation(curAllocation *Allocation) { - s.curAllocation = curAllocation -} - -func (s *Server) AllAllocations() map[string]*Allocation { - return s.allAllocations -} - -func (s *Server) Spec() *config.ServerSpec { - return s.spec -} - -func (s *Server) 
UpdateDesiredAlloc() { - if s.allocation != nil { - s.spec.DesiredAlloc = *s.allocation.AllocationData() - s.spec.DesiredAlloc.Load = *s.load - } else { - s.spec.DesiredAlloc = config.AllocationData{} - } -} - -func (s *Server) ApplyDesiredAlloc() { - s.spec.CurrentAlloc = s.spec.DesiredAlloc - s.curAllocation = AllocationFromData(&s.spec.CurrentAlloc) - s.load = &s.spec.CurrentAlloc.Load -} - -func (s *Server) String() string { - return fmt.Sprintf("Server: name=%s; class=%s; model=%s; load=%v; allocation=%v", - s.name, s.serviceClassName, s.modelName, s.load, s.allocation) -} diff --git a/pkg/core/serviceclass.go b/pkg/core/serviceclass.go deleted file mode 100644 index 65babb3e6..000000000 --- a/pkg/core/serviceclass.go +++ /dev/null @@ -1,86 +0,0 @@ -package core - -import ( - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" -) - -// A service class -type ServiceClass struct { - name string // unique name - priority int // non-negative priority (smaller values for higher priority) - targets map[string]*Target // target SLOs for each model -} - -// target SLOs for service class -type Target struct { - ITL float32 - TTW float32 - TPS float32 -} - -func (t *Target) String() string { - return fmt.Sprintf("[ITL=%v, TTW=%v, TPS=%v]", - t.ITL, t.TTW, t.TPS) -} - -func NewServiceClass(name string, priority int) *ServiceClass { - if priority < 0 { - priority = config.DefaultServiceClassPriority - } - return &ServiceClass{ - name: name, - priority: priority, - targets: map[string]*Target{}, - } -} - -// set target SLOs for a model in a service class (replace if already exists) -func (c *ServiceClass) SetTargetFromSpec(spec *config.ServiceClassSpec) { - if spec.Name == c.name { - c.targets[spec.Model] = &Target{ - ITL: spec.SLO_ITL, - TTW: spec.SLO_TTW, - TPS: spec.SLO_TPS, - } - } -} - -func (c *ServiceClass) Name() string { - return c.name -} - -func (c *ServiceClass) Priority() int { - return c.priority -} - -func (c *ServiceClass) ModelTarget(modelName string) *Target { - return c.targets[modelName] -} - -func (c *ServiceClass) RemoveModelTarget(modelName string) { - delete(c.targets, modelName) -} - -func (c *ServiceClass) Spec() []config.ServiceClassSpec { - specs := make([]config.ServiceClassSpec, len(c.targets)) - i := 0 - for modelName, target := range c.targets { - specs[i] = config.ServiceClassSpec{ - Name: c.name, - Priority: c.priority, - Model: modelName, - SLO_ITL: target.ITL, - SLO_TTW: target.TTW, - SLO_TPS: target.TPS, - } - i++ - } - return specs -} - -func (c *ServiceClass) String() string { - return fmt.Sprintf("ServiceClass: name=%s; priority=%d; targets=%v", - c.name, c.priority, c.targets) -} diff --git a/pkg/core/system.go b/pkg/core/system.go deleted file mode 100644 index d976bd58e..000000000 --- a/pkg/core/system.go +++ /dev/null @@ -1,385 +0,0 @@ -package core - -import ( - "bytes" - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" -) - -var ( - // a static reference to the singleton system object - TheSystem *System -) - -func GetAccelerator(name string) *Accelerator { - return TheSystem.Accelerator(name) -} - -func GetModel(name string) *Model { - return TheSystem.Model(name) -} - -func GetServiceClass(name string) *ServiceClass { - return TheSystem.ServiceClass(name) -} - -func GetServer(name string) *Server { - return TheSystem.Server(name) -} - -func GetAccelerators() map[string]*Accelerator { - return TheSystem.accelerators -} - -func GetModels() map[string]*Model { - return TheSystem.models -} - -func GetServers() map[string]*Server { - return 
TheSystem.servers -} - -func GetCapacities() map[string]int { - return TheSystem.capacity -} - -// System comprising all accelerators, models, service classes, and servers -type System struct { - accelerators map[string]*Accelerator - models map[string]*Model - serviceClasses map[string]*ServiceClass - servers map[string]*Server - - capacity map[string]int // available count of accelerator types - allocationByType map[string]*AllocationByType // number of allocated accelerator types - allocationSolution *config.AllocationSolution -} - -// Allocation data about an accelerator type -type AllocationByType struct { - name string // name of accelerator type - count int // total number of this type - limit int // maximum number of this type - cost float32 // total cost of this type -} - -// Create a new system -func NewSystem() *System { - return &System{ - accelerators: make(map[string]*Accelerator), - models: make(map[string]*Model), - serviceClasses: make(map[string]*ServiceClass), - servers: make(map[string]*Server), - - capacity: make(map[string]int), - allocationByType: make(map[string]*AllocationByType), - allocationSolution: nil, - } -} - -// Set system from spec -func (s *System) SetFromSpec(d *config.SystemSpec) *config.OptimizerSpec { - s.SetAcceleratorsFromSpec(&d.Accelerators) - s.SetModelsFromSpec(&d.Models) - s.SetServiceClassesFromSpec(&d.ServiceClasses) - s.SetServersFromSpec(&d.Servers) - s.SetCapacityFromSpec(&d.Capacity) - return &d.Optimizer.Spec -} - -// Set accelerators from spec -func (s *System) SetAcceleratorsFromSpec(d *config.AcceleratorData) { - for _, v := range d.Spec { - s.AddAcceleratorFromSpec(v) - } -} - -// Add an accelerator (replace if already exists) -func (s *System) AddAcceleratorFromSpec(spec config.AcceleratorSpec) { - s.accelerators[spec.Name] = NewAcceleratorFromSpec(&spec) -} - -// Remove an accelerator -func (s *System) RemoveAccelerator(name string) error { - if s.accelerators[name] == nil { - return fmt.Errorf("accelerator %s not found", name) - } - delete(s.accelerators, name) - return nil -} - -// Set capacity count from spec -func (s *System) SetCapacityFromSpec(d *config.CapacityData) { - for _, v := range d.Count { - s.SetCountFromSpec(v) - } -} - -// Set capacity count for an accelerator type -func (s *System) SetCountFromSpec(spec config.AcceleratorCount) { - s.capacity[spec.Type] = spec.Count -} - -// Set models from spec -func (s *System) SetModelsFromSpec(d *config.ModelData) { - for _, pd := range d.PerfData { - modelName := pd.Name - var model *Model - if model = s.models[modelName]; model == nil { - model = s.AddModel(modelName) - } - model.AddPerfDataFromSpec(&pd) - } -} - -// Add a model (replace if already exists) -func (s *System) AddModel(name string) *Model { - model := NewModel(name) - s.models[name] = model - return model -} - -// Remove a model -func (s *System) RemoveModel(name string) error { - if s.models[name] == nil { - return fmt.Errorf("model %s not found", name) - } - delete(s.models, name) - return nil -} - -// Set servers from spec -func (s *System) SetServersFromSpec(d *config.ServerData) { - for _, v := range d.Spec { - s.servers[v.Name] = NewServerFromSpec(&v) - } -} - -// Add a server (replace if already exists) -func (s *System) AddServerFromSpec(spec config.ServerSpec) { - s.servers[spec.Name] = NewServerFromSpec(&spec) -} - -// Remove a server -func (s *System) RemoveServer(name string) error { - if s.servers[name] == nil { - return fmt.Errorf("server %s not found", name) - } - delete(s.servers, name) - 
return nil -} - -// Set service classes from spec -func (s *System) SetServiceClassesFromSpec(d *config.ServiceClassData) { - for _, t := range d.Spec { - name := t.Name - if _, exists := s.serviceClasses[name]; !exists { - s.serviceClasses[name] = NewServiceClass(name, t.Priority) - } - svc := s.serviceClasses[name] - svc.SetTargetFromSpec(&t) - } -} - -// Add a service class (replace if already exists) -func (s *System) AddServiceClass(name string, priority int) { - s.serviceClasses[name] = NewServiceClass(name, priority) -} - -// Remove a service class -func (s *System) RemoveServiceClass(name string) error { - if s.serviceClasses[name] == nil { - return fmt.Errorf("service class %s not found", name) - } - delete(s.serviceClasses, name) - return nil -} - -// Get all accelerators -func (s *System) Accelerators() map[string]*Accelerator { - return s.accelerators -} - -// Get all models -func (s *System) Models() map[string]*Model { - return s.models -} - -// Get all service classes -func (s *System) ServiceClasses() map[string]*ServiceClass { - return s.serviceClasses -} - -// Get all servers -func (s *System) Servers() map[string]*Server { - return s.servers -} - -// Get accelerator object for a given accelerator name; nil if doesn't exist -func (s *System) Accelerator(name string) *Accelerator { - return s.accelerators[name] -} - -// Get model object for a given model name; nil if doesn't exist -func (s *System) Model(name string) *Model { - return s.models[name] -} - -// Get service class object for a given service class name; nil if doesn't exist -func (s *System) ServiceClass(name string) *ServiceClass { - return s.serviceClasses[name] -} - -// Get server object for a given server name; nil if doesn't exist -func (s *System) Server(name string) *Server { - return s.servers[name] -} - -// Get capacities of accelerator types -func (s *System) Capacities() map[string]int { - return s.capacity -} - -// Get capacity of an accelerator type -func (s *System) Capacity(name string) (int, bool) { - if cap, exists := s.capacity[name]; !exists { - return 0, false - } else { - return cap, true - } -} - -// Remove capacity of an accelerator type -func (s *System) RemoveCapacity(name string) bool { - if _, exists := s.capacity[name]; !exists { - return false - } - delete(s.capacity, name) - return true -} - -// Calculate basic parameters -func (s *System) Calculate() { - for _, g := range s.accelerators { - g.Calculate() - } - for _, m := range s.models { - m.Calculate(s.accelerators) - } - for _, v := range s.servers { - v.Calculate(s.accelerators) - } -} - -// Accumulate allocation data by accelerator type -func (s *System) AllocateByType() { - s.allocationByType = map[string]*AllocationByType{} - for _, server := range s.Servers() { - modelName := server.ModelName() - serverAlloc := server.Allocation() - if serverAlloc == nil { - continue - } - accName := serverAlloc.accelerator - acc := s.accelerators[accName] - model := s.Model(modelName) - if acc == nil || model == nil { - continue - } - nameType := acc.Type() - var alloc *AllocationByType - var exists bool - if alloc, exists = s.allocationByType[nameType]; !exists { - alloc = &AllocationByType{ - name: nameType, - count: 0, - limit: s.capacity[nameType], - cost: 0, - } - } - alloc.count += serverAlloc.numReplicas * model.numInstances[accName] * acc.Multiplicity() - alloc.cost += serverAlloc.cost - s.allocationByType[nameType] = alloc - } -} - -// generate json allocation solution for all servers in the system -func (s *System) 
GenerateSolution() *config.AllocationSolution { - allocationSolution := config.AllocationSolution{ - Spec: make(map[string]config.AllocationData), - } - for serverName, server := range s.servers { - serverAlloc := server.Allocation() - if serverAlloc == nil { - continue - } - load := server.Load() - allocData := serverAlloc.AllocationData() - allocData.Load = *load - allocationSolution.Spec[serverName] = *allocData - } - s.allocationSolution = &allocationSolution - return &allocationSolution -} - -func (a *AllocationByType) String() string { - var b bytes.Buffer - fmt.Fprintf(&b, "name=%s, count=%d, limit=%d, cost=%v", a.name, a.count, a.limit, a.cost) - return b.String() -} - -func (s *System) String() string { - var b bytes.Buffer - // b.WriteString("Accelerators: \n") - // for _, g := range s.accelerators { - // fmt.Fprintln(&b, g) - // } - // fmt.Fprintf(&b, "capacity=%v \n", s.capacity) - // b.WriteString("Models: \n") - // for _, m := range s.models { - // fmt.Fprintln(&b, m) - // } - // b.WriteString("ServiceClasses: \n") - // for _, c := range s.serviceClasses { - // fmt.Fprintln(&b, c) - // } - // b.WriteString("Servers: \n") - // for _, s := range s.servers { - // fmt.Fprintln(&b, s) - // } - - b.WriteString("Solution: \n") - totalCost := float32(0) - for serverName, server := range s.Servers() { - srvClassName := server.ServiceClassName() - modelName := server.ModelName() - load := server.Load() - svc := s.serviceClasses[srvClassName] - if load == nil || svc == nil { - continue - } - target := svc.ModelTarget(modelName) - if target == nil { - continue - } - alloc := server.Allocation() - if alloc == nil { - fmt.Fprintf(&b, "s=%s; c=%s; m=%s; no feasible allocation! \n", serverName, srvClassName, modelName) - continue - } - totalCost += alloc.cost - rate := load.ArrivalRate - tokens := load.AvgLength - fmt.Fprintf(&b, "c=%s; m=%s; rate=%v; tk=%d; sol=%d, alloc=%v; ", srvClassName, modelName, rate, tokens, len(server.allAllocations), alloc) - fmt.Fprintf(&b, "slo-itl=%v, slo-ttw=%v, slo-tps=%v \n", target.ITL, target.TTW, target.TPS) - } - - b.WriteString("AllocationByType: \n") - for _, a := range s.allocationByType { - fmt.Fprintf(&b, "%v \n", a) - } - fmt.Fprintf(&b, "totalCost=%v \n", totalCost) - - return b.String() -} diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go deleted file mode 100644 index 312d8741a..000000000 --- a/pkg/manager/manager.go +++ /dev/null @@ -1,27 +0,0 @@ -package manager - -import ( - "github.com/llm-inferno/inferno/pkg/core" - "github.com/llm-inferno/inferno/pkg/solver" -) - -type Manager struct { - system *core.System - optimizer *solver.Optimizer -} - -func NewManager(system *core.System, optimizer *solver.Optimizer) *Manager { - core.TheSystem = system - return &Manager{ - system: system, - optimizer: optimizer, - } -} - -func (m *Manager) Optimize() error { - if err := m.optimizer.Optimize(); err != nil { - return err - } - m.system.AllocateByType() - return nil -} diff --git a/pkg/solver/milpsolver.go b/pkg/solver/milpsolver.go deleted file mode 100644 index fa0c75d20..000000000 --- a/pkg/solver/milpsolver.go +++ /dev/null @@ -1,284 +0,0 @@ -package solver - -import ( - "fmt" - - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/inferno/pkg/core" - lpsolveConfig "github.com/llm-inferno/lpsolve/pkg/config" - lpsolve "github.com/llm-inferno/lpsolve/pkg/core" - lpsolveUtils "github.com/llm-inferno/lpsolve/pkg/utils" -) - -type MILPSolver struct { - optimizerSpec *config.OptimizerSpec - - numServers int // number of 
servers (a pair of service class and model) - numAccelerators int // number of accelerators - instanceCost []float64 // [numAccelerators] - numInstancesPerReplica [][]int // [sumServers][numAccelerators] - ratePerReplica [][]float64 // [sumServers][numAccelerators] - arrivalRates []float64 // [sumServers] - - numAcceleratorTypes int // number of accelerator types - unitsAvail []int // [numAcceleratorTypes] - acceleratorTypesMatrix [][]int // [numAcceleratorTypes][numAccelerators] - - numReplicas [][]int // resulting number of replicas [numServers][numAccelerators] - instancesUsed []int // number of used accelerator units [numAccelerators] - unitsUsed []int // [numAcceleratorTypes] - - accIndex map[string]int // acceleratorName -> index in accelerator arrays - accLookup []string // index -> acceleratorName - serverIndex map[string]int // serverName -> index in server arrays - serverLookup []string // index -> serverName - accTypeIndex map[string]int // acceleratorTypeName -> index in acceleratorType arrays - accTypeLookup []string // index -> acceleratorTypeName -} - -func NewMILPSolver(optimizerSpec *config.OptimizerSpec) *MILPSolver { - return &MILPSolver{ - optimizerSpec: optimizerSpec, - } -} - -func (v *MILPSolver) Solve() error { - v.preProcess() - - isLimited := !v.optimizerSpec.Unlimited - isMulti := v.optimizerSpec.Heterogeneous - useCplex := v.optimizerSpec.UseCplex - if err := v.optimize(isLimited, isMulti, useCplex); err != nil { - return err - } - - v.postProcess() - return nil -} - -// prepare input date for MILP solver -func (v *MILPSolver) preProcess() { - - // create map and lookup arrays for accelerators - accMap := core.GetAccelerators() - v.numAccelerators = len(accMap) - v.accIndex = make(map[string]int) - v.accLookup = make([]string, v.numAccelerators) - - // set cost values - v.instanceCost = make([]float64, v.numAccelerators) - index := 0 - for accName, acc := range accMap { - v.accIndex[accName] = index - v.accLookup[index] = accName - v.instanceCost[index] = float64(acc.Spec().Cost) - index++ - } - - // fmt.Println(v.accIndex) - // fmt.Println(v.accLookup) - // fmt.Println(lpsolveUtils.Pretty1D("unitCost", v.instanceCost)) - - // create map and lookup arrays for accelerator types - capMap := core.GetCapacities() - v.numAcceleratorTypes = len(capMap) - v.accTypeIndex = make(map[string]int) - v.accTypeLookup = make([]string, v.numAcceleratorTypes) - - // set available accelerator types - v.unitsAvail = make([]int, v.numAcceleratorTypes) - v.acceleratorTypesMatrix = make([][]int, v.numAcceleratorTypes) - index = 0 - for accTypeName, accTypeCount := range capMap { - v.accTypeIndex[accTypeName] = index - v.accTypeLookup[index] = accTypeName - v.unitsAvail[index] = accTypeCount - v.acceleratorTypesMatrix[index] = make([]int, v.numAccelerators) - index++ - } - - // set matrix of accelerator types to accelerators - for accName, acc := range accMap { - accType := acc.Type() - if accIndex, exists := v.accIndex[accName]; exists { - accTypeIndex := v.accTypeIndex[accType] - v.acceleratorTypesMatrix[accTypeIndex][accIndex] = acc.Spec().Multiplicity - } - } - - // fmt.Println(v.accTypeIndex) - // fmt.Println(v.accTypeLookup) - // fmt.Println(lpsolveUtils.Pretty1D("unitsAvailByType", v.unitsAvail)) - // fmt.Println(lpsolveUtils.Pretty2D("acceleratorTypesMatrix", v.acceleratorTypesMatrix)) - - // create map and lookup arrays for servers - index = 0 - v.serverIndex = make(map[string]int) - srvMap := core.GetServers() - for srvName := range srvMap { - v.serverIndex[srvName] = index 
- index++ - } - v.numServers = index - v.serverLookup = make([]string, v.numServers) - for srvName, index := range v.serverIndex { - v.serverLookup[index] = srvName - } - - // set values for arrival rates and per replica arrivals and number of instances - v.arrivalRates = make([]float64, v.numServers) - v.numInstancesPerReplica = make([][]int, v.numServers) - v.ratePerReplica = make([][]float64, v.numServers) - for i := 0; i < v.numServers; i++ { - v.numInstancesPerReplica[i] = make([]int, v.numAccelerators) - v.ratePerReplica[i] = make([]float64, v.numAccelerators) - } - modelMap := core.GetModels() - for srvName, srv := range srvMap { - if i, exists := v.serverIndex[srvName]; exists { - load := srv.Load() - if load == nil { - continue - } - v.arrivalRates[i] = float64(load.ArrivalRate / 60 / 1000) - m := modelMap[srv.ModelName()] - for accName, j := range v.accIndex { - //acc := accMap[accName] - v.numInstancesPerReplica[i][j] = m.NumInstances(accName) - if alloc := srv.AllAllocations()[accName]; alloc != nil { - v.ratePerReplica[i][j] = float64(alloc.MaxArrvRatePerReplica()) - } - } - } - - } - - // fmt.Println(v.serverIndex) - // fmt.Println(lpsolveUtils.Pretty1D("arrivalRates", v.arrivalRates)) - // fmt.Println(lpsolveUtils.Pretty2D("ratePerReplica", v.ratePerReplica)) - // fmt.Println(lpsolveUtils.Pretty2D("numInstancesPerReplica", v.numInstancesPerReplica)) -} - -// call MILP solver to optimize problem -func (v *MILPSolver) optimize(isLimited bool, isMulti bool, useCplex bool) error { - problemType := lpsolveConfig.SINGLE - if isMulti { - problemType = lpsolveConfig.MULTI - } - p, err := v.createProblem(problemType, isLimited, useCplex) - if err != nil { - return err - } - if err := p.Solve(); err != nil { - return err - } - v.printResults(problemType, p) - return nil -} - -func (v *MILPSolver) createProblem(problemType lpsolveConfig.ProblemType, isLimited bool, useCplex bool) (lpsolve.Problem, error) { - // create a new problem instance - var p lpsolve.Problem - var err error - switch problemType { - case lpsolveConfig.SINGLE: - if useCplex { - p, err = lpsolve.CreateCplexProblem(v.numServers, v.numAccelerators, v.instanceCost, v.numInstancesPerReplica, - v.ratePerReplica, v.arrivalRates) - } else { - p, err = lpsolve.CreateSingleAssignProblem(v.numServers, v.numAccelerators, v.instanceCost, v.numInstancesPerReplica, - v.ratePerReplica, v.arrivalRates) - } - case lpsolveConfig.MULTI: - if useCplex { - p, err = lpsolve.CreateCplexProblem(v.numServers, v.numAccelerators, v.instanceCost, v.numInstancesPerReplica, - v.ratePerReplica, v.arrivalRates) - } else { - p, err = lpsolve.CreateMultiAssignProblem(v.numServers, v.numAccelerators, v.instanceCost, v.numInstancesPerReplica, - v.ratePerReplica, v.arrivalRates) - } - default: - return nil, fmt.Errorf("unknown problem type: %s", problemType) - } - if err != nil { - return nil, err - } - - // set accelerator count limited option - if isLimited { - if err = p.SetLimited(v.numAcceleratorTypes, v.unitsAvail, v.acceleratorTypesMatrix); err != nil { - return nil, err - } - if useCplex { - switch problemType { - case lpsolveConfig.SINGLE: - SetFileNames(p, "single-limited") - case lpsolveConfig.MULTI: - SetFileNames(p, "multi-limited") - } - } - } else { - p.UnSetLimited() - if useCplex { - switch problemType { - case lpsolveConfig.SINGLE: - SetFileNames(p, "single-unlimited") - case lpsolveConfig.MULTI: - SetFileNames(p, "multi-unlimited") - } - } - } - return p, nil -} - -func SetFileNames(p lpsolve.Problem, name string) { - pc := 
p.(*lpsolve.CplexProblem) - pc.SetModelFileName(name + ".mod") - pc.SetDataFileName(name + ".dat") - pc.SetOutputFileName(name + ".txt") -} - -// print solution details -func (v *MILPSolver) printResults(problemType lpsolveConfig.ProblemType, p lpsolve.Problem) { - fmt.Printf("Problem type: %v\n", problemType) - fmt.Printf("Solution type: %v\n", p.GetSolutionType()) - fmt.Printf("Solution time: %d msec\n", p.GetSolutionTimeMsec()) - fmt.Printf("Objective value: %v\n", p.GetObjectiveValue()) - - fmt.Println() - fmt.Printf("Accelerators=%v \n", v.accLookup) - fmt.Printf("Servers=%v \n", v.serverLookup) - fmt.Println() - - v.numReplicas = p.GetNumReplicas() - fmt.Println(lpsolveUtils.Pretty2D("numReplicas", v.numReplicas)) - - v.instancesUsed = p.GetInstancesUsed() - fmt.Println(lpsolveUtils.Pretty1D("instancesUsed", v.instancesUsed)) - - if p.IsLimited() { - fmt.Printf("AcceleratorTypes=%v \n", v.accTypeLookup) - fmt.Println(lpsolveUtils.Pretty1D("unitsAvail", v.unitsAvail)) - v.unitsUsed = p.GetUnitsUsed() - fmt.Println(lpsolveUtils.Pretty1D("unitsUsed", v.unitsUsed)) - } - fmt.Println() -} - -// process output data from MILP solver -func (v *MILPSolver) postProcess() { - for i := 0; i < v.numServers; i++ { - for j := 0; j < v.numAccelerators; j++ { - n := v.numReplicas[i][j] - if n == 0 { - continue - } - accName := v.accLookup[j] - sc := core.GetServer(v.serverLookup[i]) - // TODO: Fix this - if alloc := sc.AllAllocations()[accName]; alloc != nil { - sc.SetAllocation(alloc) - } - } - } -} diff --git a/pkg/solver/optimizer.go b/pkg/solver/optimizer.go deleted file mode 100644 index 8d3255ff8..000000000 --- a/pkg/solver/optimizer.go +++ /dev/null @@ -1,48 +0,0 @@ -package solver - -import ( - "bytes" - "fmt" - "time" - - "github.com/llm-inferno/inferno/pkg/config" -) - -type Optimizer struct { - spec *config.OptimizerSpec - solver *Solver - solutionTimeMsec int64 -} - -// Create optimizer from spec -func NewOptimizerFromSpec(spec *config.OptimizerSpec) *Optimizer { - return &Optimizer{ - spec: spec, - } -} - -func (o *Optimizer) Optimize() error { - if o.spec == nil { - return fmt.Errorf("missing optimizer spec") - } - o.solver = NewSolver(o.spec) - - startTime := time.Now() - err := o.solver.Solve() - endTime := time.Now() - o.solutionTimeMsec = endTime.Sub(startTime).Milliseconds() - return err -} - -func (o *Optimizer) SolutionTimeMsec() int64 { - return o.solutionTimeMsec -} - -func (o *Optimizer) String() string { - var b bytes.Buffer - if o.solver != nil { - b.WriteString(o.solver.String()) - } - fmt.Fprintf(&b, "Solution time: %d msec\n", o.solutionTimeMsec) - return b.String() -} diff --git a/pkg/solver/solver.go b/pkg/solver/solver.go deleted file mode 100644 index 55283ebe8..000000000 --- a/pkg/solver/solver.go +++ /dev/null @@ -1,233 +0,0 @@ -package solver - -import ( - "bytes" - "cmp" - "fmt" - "math" - "slices" - - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/inferno/pkg/core" -) - -// Solver of allocation assignment problem -type Solver struct { - optimizerSpec *config.OptimizerSpec - - // current allocation for all servers - currentAllocation map[string]*core.Allocation - - // difference in allocation for all servers - diffAllocation map[string]*core.AllocationDiff -} - -func NewSolver(optimizerSpec *config.OptimizerSpec) *Solver { - return &Solver{ - optimizerSpec: optimizerSpec, - currentAllocation: make(map[string]*core.Allocation), - diffAllocation: make(map[string]*core.AllocationDiff), - } -} - -// Entry in the solution space for a service 
class and model pair -type entry struct { - serverName string // server name - priority int // priority of service class for server - curIndex int // current index in allocation list - allocations []*core.Allocation // ordered list of allocations - delta float32 // delta penalty if current allocation not allowed and next allocation is allowed -} - -func (e *entry) String() string { - var b bytes.Buffer - fmt.Fprintf(&b, "sName=%s, prio=%d, curIndex=%d, delta=%v, allocations=%v \n", - e.serverName, e.priority, e.curIndex, e.delta, e.allocations) - return b.String() -} - -// Find optimal allocation for all service classes -func (s *Solver) Solve() error { - // take snapshot of current allocations - s.currentAllocation = make(map[string]*core.Allocation) - for serverName, server := range core.GetServers() { - if alloc := server.CurAllocation(); alloc != nil { - s.currentAllocation[serverName] = alloc - } - } - - // find solution - if s.optimizerSpec.MILPSolver { - if err := s.SolveMILP(); err != nil { - return err - } - } else if s.optimizerSpec.Unlimited { - s.SolveUnlimited() - } else { - s.SolveLimited() - } - // calculate difference - - // TODO: cleanup after trying MIP solver - - s.diffAllocation = make(map[string]*core.AllocationDiff) - for serverName, server := range core.GetServers() { - curAlloc := s.currentAllocation[serverName] - desiredAlloc := server.Allocation() - if allocDiff := core.CreateAllocationDiff(curAlloc, desiredAlloc); allocDiff != nil { - s.diffAllocation[serverName] = allocDiff - } - } - return nil -} - -// Find optimal allocations assuming unlimited accelerator capacity -func (s *Solver) SolveUnlimited() { - for _, server := range core.GetServers() { - server.RemoveAllocation() - // select allocation with minimum value - minVal := float32(math.MaxFloat32) - var minAlloc *core.Allocation - for _, alloc := range server.AllAllocations() { - if alloc.Value() < minVal { - minVal = alloc.Value() - minAlloc = alloc - } - } - if minAlloc != nil { - server.SetAllocation(minAlloc) - } - } -} - -// Find optimal allocations assuming limited accelerator capacity -func (s *Solver) SolveLimited() { - // calculate available count of accelerator types - available := make(map[string]int) - for k, v := range core.GetCapacities() { - available[k] = v - } - // for all servers, sort allocations - var entries []*entry = make([]*entry, 0) - for serverName, server := range core.GetServers() { - server.RemoveAllocation() - allAllocs := server.AllAllocations() - if len(allAllocs) == 0 { - continue - } - e := &entry{ - serverName: serverName, - priority: server.Priority(), - curIndex: 0, - allocations: make([]*core.Allocation, len(allAllocs)), - delta: 0, - } - i := 0 - for _, alloc := range allAllocs { - e.allocations[i] = alloc - i++ - } - slices.SortFunc(e.allocations, func(a, b *core.Allocation) int { - return cmp.Compare(a.Value(), b.Value()) - }) - if len(e.allocations) > 1 { - // value is difference between this and next allocation - e.delta = e.allocations[1].Value() - e.allocations[0].Value() - } else { - // last choice, large value for not assigning - e.delta = math.MaxFloat32 - } - entries = append(entries, e) - } - // sort all entries - orderFunc := func(a, b *entry) int { - aPrio := (1 + config.PriorityWeightFactor/float32(1+a.priority)) - bPrio := (1 + config.PriorityWeightFactor/float32(1+b.priority)) - - aDelta := a.delta * aPrio - bDelta := b.delta * bPrio - - if aDelta == bDelta { - aVal := a.allocations[a.curIndex].Value() * aPrio - bVal := b.allocations[b.curIndex].Value() 
* bPrio
-			return cmp.Compare(bVal, aVal)
-		}
-		return cmp.Compare(bDelta, aDelta)
-	}
-
-	// straight priorities
-	// orderFunc := func(a, b *entry) int {
-	// 	if a.priority == b.priority {
-	// 		if a.delta == b.delta {
-	// 			return cmp.Compare(b.allocations[b.curIndex].Value(), a.allocations[a.curIndex].Value())
-	// 		}
-	// 		return cmp.Compare(b.delta, a.delta)
-	// 	} else {
-	// 		return cmp.Compare(a.priority, b.priority)
-	// 	}
-	// }
-
-	slices.SortFunc(entries, orderFunc)
-	// start assignment greedily
-	for len(entries) > 0 {
-		top := entries[0]
-		entries = entries[1:]
-
-		if len(top.allocations) == 0 {
-			continue
-		}
-
-		serverName := top.serverName
-		server := core.GetServer(serverName)
-		if server == nil {
-			continue
-		}
-		model := core.GetModel(server.ModelName())
-		if model == nil {
-			continue
-		}
-
-		alloc := top.allocations[top.curIndex]
-		gName := alloc.Accelerator()
-		replicas := alloc.NumReplicas()
-		acc := core.GetAccelerator(gName)
-		tName := acc.Type()
-		count := replicas * model.NumInstances(gName) * acc.Spec().Multiplicity
-
-		if available[tName] >= count {
-			available[tName] -= count
-			server.SetAllocation(alloc)
-		} else {
-			top.curIndex++
-			if top.curIndex+1 < len(top.allocations) {
-				top.delta = top.allocations[top.curIndex+1].Value() - top.allocations[top.curIndex].Value()
-			} else if top.curIndex == len(top.allocations) {
-				continue
-			} else {
-				top.delta = math.MaxFloat32
-			}
-			i, _ := slices.BinarySearchFunc(entries, top, orderFunc)
-			entries = slices.Insert(entries, i, top)
-		}
-	}
-}
-
-func (s *Solver) SolveMILP() error {
-	mip := NewMILPSolver(s.optimizerSpec)
-	return mip.Solve()
-}
-
-func (s *Solver) AllocationDiff() map[string]*core.AllocationDiff {
-	return s.diffAllocation
-}
-
-func (s *Solver) String() string {
-	var b bytes.Buffer
-	b.WriteString("Solver: \n")
-	for serverName, allocDiff := range s.diffAllocation {
-		fmt.Fprintf(&b, "sName=%s, allocDiff=%v \n",
-			serverName, allocDiff)
-	}
-	return b.String()
-}
diff --git a/pkg/utils/helpers.go b/pkg/utils/helpers.go
deleted file mode 100644
index 4682b316c..000000000
--- a/pkg/utils/helpers.go
+++ /dev/null
@@ -1,12 +0,0 @@
-package utils
-
-import "encoding/json"
-
-// unmarshal a byte array to its corresponding object
-func FromDataToSpec[T interface{}](byteValue []byte, t T) (*T, error) {
-	var d T
-	if err := json.Unmarshal(byteValue, &d); err != nil {
-		return nil, err
-	}
-	return &d, nil
-}
diff --git a/rest-server/README.md b/rest-server/README.md
deleted file mode 100644
index 7f1d263ee..000000000
--- a/rest-server/README.md
+++ /dev/null
@@ -1,294 +0,0 @@
-# A REST API Server for the Optimizer
-
-The host name and port for the server are specified as environment variables `INFERNO_HOST` and `INFERNO_PORT`, respectively. If not set, the default server is at `localhost:8080`.
-
-## Data Format
-
-The following data is needed by the Optimizer (declarations are described in [types](../pkg/config/types.go)).
-
-1. **Accelerator data**: For all accelerators, the specification, such as name, type, cost, and other attributes of an accelerator. An example follows.
-
-    ```json
-    {
-      "accelerators": [
-        {
-          "name": "A100",
-          "type": "A100",
-          "multiplicity": 1,
-          "power": {
-            "idle": 150,
-            "full": 400,
-            "midPower": 320,
-            "midUtil": 0.6
-          },
-          "cost": 40.00
-        },
-        {
-          "name": "G2",
-          "type": "G2",
-          "multiplicity": 1,
-          "power": {
-            "idle": 180,
-            "full": 600,
-            "midPower": 500,
-            "midUtil": 0.6
-          },
-          "cost": 25.00
-        },
-        {
-          "name": "4xA100",
-          "type": "A100",
-          "multiplicity": 4,
-          "power": {
-            "idle": 600,
-            "full": 1600,
-            "midPower": 1280,
-            "midUtil": 0.6
-          },
-          "cost": 160.00
-        }
-      ]
-    }
-    ```
-
-1. **Capacity data**: For all accelerator types, a count of available units of that type. An example follows.
-
-    ```json
-    {
-      "count": [
-        {
-          "type": "G2",
-          "count": 256
-        },
-        {
-          "type": "A100",
-          "count": 128
-        }
-      ]
-    }
-    ```
-
-1. **Model data**: For all models, a collection of performance data for pairs of model and accelerators. An example follows.
-
-    ```json
-    {
-      "models": [
-        {
-          "name": "granite_13b",
-          "acc": "A100",
-          "accCount": 1,
-          "alpha": 20.58,
-          "beta": 0.41,
-          "maxBatchSize": 32,
-          "atTokens": 512
-        },
-        {
-          "name": "granite_13b",
-          "acc": "G2",
-          "accCount": 1,
-          "alpha": 17.15,
-          "beta": 0.34,
-          "maxBatchSize": 38,
-          "atTokens": 512
-        },
-        {
-          "name": "llama_70b",
-          "acc": "G2",
-          "accCount": 2,
-          "alpha": 22.84,
-          "beta": 5.89,
-          "maxBatchSize": 6,
-          "atTokens": 512
-        }
-      ]
-    }
-    ```
-
-    Performance data includes
-
-    - `accCount`: number of accelerators (cards)
-    - `alpha` and `beta`: parameters (in msec) of the linear approximation of inter-token latency (ITL) as a function of the batch size (n), *ITL = alpha + beta · n*
-    - `maxBatchSize`: maximum batch size to use, beyond which performance deteriorates
-    - `atTokens`: average number of tokens used when determining the `maxBatchSize`
-
-1. **Service class data**: For all service classes, the specification, such as name, priority, and SLO targets for a service class. An example follows.
-
-    ```json
-    {
-      "serviceClasses": [
-        {
-          "name": "Premium",
-          "model": "granite_13b",
-          "priority": 1,
-          "slo-itl": 40,
-          "slo-ttw": 500
-        },
-        {
-          "name": "Premium",
-          "model": "llama_70b",
-          "priority": 1,
-          "slo-itl": 80,
-          "slo-ttw": 500
-        },
-        {
-          "name": "Bronze",
-          "model": "granite_13b",
-          "priority": 2,
-          "slo-itl": 80,
-          "slo-ttw": 1000
-        },
-        {
-          "name": "Batch2K",
-          "model": "mixtral_8_7b",
-          "priority": 4,
-          "slo-tps": 2000
-        }
-      ]
-    }
-    ```
-
-    The service class specification includes
-
-    - `slo-itl`: target SLO for ITL (msec)
-    - `slo-ttw`: target SLO for request waiting (queueing) time (msec)
-    - `slo-tps`: target SLO for throughput (tokens/sec)
-
-1. **Server data**: For all inference servers, the name of the server, the model and service class it serves (currently, assuming a single model and service class per server), and current and desired allocations. The current allocation reflects the state of the server, and the desired allocation is provided by the Optimizer (as a solution to an optimization problem). An allocation includes accelerator, number of replicas, maximum batch size, cost, and observed or anticipated average ITL and waiting time, as well as load data. The load data includes statistical metrics about request arrivals and message lengths (number of tokens). An example follows.
-
-    ```json
-    {
-      "servers": [
-        {
-          "name": "Premium-granite_13b",
-          "class": "Premium",
-          "model": "granite_13b",
-          "currentAlloc": {
-            "accelerator": "A100",
-            "numReplicas": 1,
-            "maxBatch": 16,
-            "cost": 40,
-            "itlAverage": 25.2,
-            "waitAverage": 726.5,
-            "load": {
-              "arrivalRate": 100,
-              "avgLength": 999,
-              "arrivalCOV": 1.0,
-              "serviceCOV": 1.0
-            }
-          },
-          "desiredAlloc": {
-            "accelerator": "G2",
-            "numReplicas": 2,
-            "maxBatch": 19,
-            "cost": 46,
-            "itlAverage": 21.16437,
-            "waitAverage": 102.09766,
-            "load": {
-              "arrivalRate": 60,
-              "avgLength": 1024,
-              "arrivalCOV": 1.0,
-              "serviceCOV": 1.0
-            }
-          }
-        }
-      ]
-    }
-    ```
-
-1. **Optimizer data**: Optional flags for the Optimizer. An example follows.
-
-    ```json
-    {
-      "optimizer": {
-        "unlimited": true,
-        "heterogeneous": false,
-        "milpSolver": false,
-        "useCplex": false
-      }
-    }
-    ```
-
-    The flags are as follows.
-
-    - `unlimited`: The available number of accelerator types is unlimited (used in capacity planning mode), as opposed to being limited to the specified number (used in cluster mode).
-    - `heterogeneous`: Whether servers accommodate heterogeneous accelerators for their replicas, e.g. five replicas of a server, two of which run on A100 and the other three on G2.
-    - `milpSolver`: Option to use an MILP (Mixed-Integer Linear Programming) problem solver rather than the (default) greedy algorithm. Currently, the provided solvers are lpSolve and CPLEX.
-    - `useCplex`: If using an MILP solver, use CPLEX.
-
-The output of the Optimizer is an Allocation Solution, in addition to updating the desired allocation of all servers.
-
-**Allocation solution data**: A map from server name to Allocation Data. An example follows.
-
-```json
-{
-  "allocations": {
-    "Premium-granite_13b": {
-      "accelerator": "G2",
-      "numReplicas": 2,
-      "maxBatch": 19,
-      "cost": 46,
-      "itlAverage": 21.16437,
-      "waitAverage": 102.09766,
-      "load": {
-        "arrivalRate": 60,
-        "avgLength": 1024,
-        "arrivalCOV": 1.0,
-        "serviceCOV": 1.0
-      }
-    }
-  }
-}
-```
-
-## Commands List
-
-| Command | Verb | Parameters | Returns | Description |
-| --- | :---: | :---: | :---: | --- |
-| **Accelerator specs** | | | | |
-| /setAccelerators | POST | AcceleratorData | | set specs for all accelerators |
-| /getAccelerators | GET | | AcceleratorData | get specs for all accelerators |
-| /getAccelerator | GET | name | AcceleratorSpec | get specs for named accelerator |
-| /addAccelerator | POST | AcceleratorSpec | | add spec for an accelerator |
-| /removeAccelerator | GET | name | | remove the named accelerator |
-| **Accelerator type counts** | | | | |
-| /setCapacities | POST | CapacityData | | set counts for all accelerator types |
-| /getCapacities | GET | | CapacityData | get counts for all accelerator types |
-| /getCapacity | GET | name | AcceleratorCount | get count for an accelerator type |
-| /setCapacity | POST | AcceleratorCount | | set a count for an accelerator type |
-| /removeCapacity | GET | name | | remove count of an accelerator type |
-| **Model data** | | | | |
-| /setModels | POST | ModelData | | set data for models |
-| /getModels | GET | | model names | get names of all models |
-| /getModel | GET | name | ModelData | get data for a model |
-| /addModel | GET | name | | add a model by name |
-| /removeModel | GET | name | | remove the data of a model |
-| **Service class data** | | | | |
-| /setServiceClasses | POST | ServiceClassData | | set data for service classes |
-| /getServiceClasses | GET | | ServiceClassData | get data for all service classes |
-| /getServiceClass | GET | name | ServiceClassData | get data for a service class |
-| /addServiceClass | GET | name/priority | | add a service class by name |
-| /removeServiceClass | GET | name | | remove the data of a service class |
-| **Service class targets** | | | | |
-| /getServiceClassModelTarget | GET | service class name / model name | ServiceClassSpec | get the SLO targets for a service class and model pair |
-| /addServiceClassModelTarget | POST | ServiceClassSpec | | add SLO targets for a service class and model pair |
-| /removeServiceClassModelTarget | GET | service class name / model name | | remove the SLO targets for a service class and model pair |
-| **Server data** | | | | |
-| /setServers | POST | ServerData | | set data for servers |
-| /getServers | GET | | ServerData | get data for all servers |
-| /getServer | GET | name | ServerSpec | get spec for a server |
-| /addServer | POST | ServerSpec | | add a server spec |
-| /removeServer | GET | name | | remove the data of a server |
-| **Model Accelerator perf data** | | | | |
-| /getModelAcceleratorPerf | GET | model name / accelerator name | ModelAcceleratorPerfData | get the perf data for a model and accelerator pair |
-| /addModelAcceleratorPerf | POST | ModelAcceleratorPerfData | | add perf data for a model and accelerator pair |
-| /removeModelAcceleratorPerf | GET | model name / accelerator name | | remove the perf data for a model and accelerator pair |
-| **Optimization** | | | | |
-| /optimize | POST | OptimizerData | AllocationSolution | optimize using all system data previously provided and return the optimal solution |
-| /optimizeOne | POST | SystemData | AllocationSolution | optimize for system data and return optimal solution (stateless, all system data provided with command) |
-
-## REST Server modes
-
-There are two modes in which to run the server.
-
-1. **Stateful**: All commands listed above are supported. The server keeps state (data about the various entities), allowing additions, updates, and deletions. Optimization is performed on the system as given by the state at the time `/optimize` is called.
-2. **Stateless**: Optimization is performed using the provided system data when `/optimizeOne` is called. Optionally, any command prefixed with `/get` may be called afterwards to get data about various entities.
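
Since the performance model in the README is just *ITL = alpha + beta · n*, the largest batch size that still meets an ITL target follows directly from the `alpha`, `beta`, and `maxBatchSize` fields. Below is a minimal Go sketch of that derivation, using the granite_13b-on-G2 numbers from the sample data above; the `perf` type and helper are illustrative, not part of this repository.

```go
package main

import (
	"fmt"
	"math"
)

// perf mirrors the per-(model, accelerator) performance fields from the model data.
type perf struct {
	alpha, beta  float64 // ITL = alpha + beta*n, in msec
	maxBatchSize int     // beyond this, performance deteriorates
}

// maxBatchForITL returns the largest batch size n with alpha + beta*n <= sloITL,
// capped at maxBatchSize; 0 means the SLO is unattainable on this accelerator.
func maxBatchForITL(p perf, sloITL float64) int {
	if sloITL < p.alpha {
		return 0
	}
	if p.beta <= 0 {
		return p.maxBatchSize
	}
	n := int(math.Floor((sloITL - p.alpha) / p.beta))
	return min(n, p.maxBatchSize)
}

func main() {
	g2 := perf{alpha: 17.15, beta: 0.34, maxBatchSize: 38} // granite_13b on G2 (sample data)
	fmt.Println(maxBatchForITL(g2, 40))                    // Premium slo-itl of 40 msec -> 38
}
```

With the Premium ITL target of 40 msec, the formula admits a batch size of 67, so on G2 the binding constraint is `maxBatchSize` (38), not the SLO.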
diff --git a/rest-server/base.go b/rest-server/base.go deleted file mode 100644 index 4a58b14ac..000000000 --- a/rest-server/base.go +++ /dev/null @@ -1,38 +0,0 @@ -package rest - -import ( - "os" - - "github.com/gin-gonic/gin" - "github.com/llm-inferno/inferno/pkg/core" -) - -// global pointer to system -var system *core.System - -// Base REST server -type BaseServer struct { - router *gin.Engine -} - -func NewBaseServer() *BaseServer { - return &BaseServer{ - router: gin.Default(), - } -} - -// start server -func (server *BaseServer) Run() { - // instantiate a clean system - system = core.NewSystem() - - host := "" - port := "8080" - if h := os.Getenv(RestHostEnvName); h != "" { - host = h - } - if p := os.Getenv(RestPortEnvName); p != "" { - port = p - } - server.router.Run(host + ":" + port) -} diff --git a/rest-server/defaults.go b/rest-server/defaults.go deleted file mode 100644 index 0f47599db..000000000 --- a/rest-server/defaults.go +++ /dev/null @@ -1,16 +0,0 @@ -package rest - -/** - * Environment variables - */ - -// REST server env names -const RestHostEnvName = "INFERNO_HOST" -const RestPortEnvName = "INFERNO_PORT" - -/** - * Parameters - */ - -// argument for statefull -const DefaultStatefull = "-F" diff --git a/rest-server/handlers.go b/rest-server/handlers.go deleted file mode 100644 index 1ca680ff7..000000000 --- a/rest-server/handlers.go +++ /dev/null @@ -1,428 +0,0 @@ -package rest - -import ( - "fmt" - "net/http" - "strconv" - - "github.com/gin-gonic/gin" - "github.com/llm-inferno/inferno/pkg/config" - "github.com/llm-inferno/inferno/pkg/core" - "github.com/llm-inferno/inferno/pkg/manager" - "github.com/llm-inferno/inferno/pkg/solver" -) - -// Handlers for REST API calls - -func setAccelerators(c *gin.Context) { - var acceleratorData config.AcceleratorData - if err := c.BindJSON(&acceleratorData); err != nil { - return - } - system.SetAcceleratorsFromSpec(&acceleratorData) - c.IndentedJSON(http.StatusOK, acceleratorData) -} - -func getAccelerators(c *gin.Context) { - accMap := system.Accelerators() - gpus := make([]config.AcceleratorSpec, len(accMap)) - i := 0 - for _, acc := range accMap { - gpus[i] = *acc.Spec() - i++ - } - c.IndentedJSON(http.StatusOK, gpus) -} - -func getAccelerator(c *gin.Context) { - name := c.Param("name") - acc := system.Accelerator(name) - if acc == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "accelerator " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, acc.Spec()) -} - -func addAccelerator(c *gin.Context) { - var acc config.AcceleratorSpec - if err := c.BindJSON(&acc); err != nil { - return - } - system.AddAcceleratorFromSpec(acc) - c.IndentedJSON(http.StatusOK, acc) -} - -func removeAccelerator(c *gin.Context) { - name := c.Param("name") - acc := system.Accelerator(name) - if err := system.RemoveAccelerator(name); err != nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "accelerator " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, acc.Spec()) -} - -func setCapacities(c *gin.Context) { - var capacityData config.CapacityData - if err := c.BindJSON(&capacityData); err != nil { - return - } - system.SetCapacityFromSpec(&capacityData) - c.IndentedJSON(http.StatusOK, capacityData) -} - -func getCapacities(c *gin.Context) { - capMap := system.Capacities() - capacities := make([]config.AcceleratorCount, len(capMap)) - i := 0 - for k, v := range capMap { - capacities[i] = config.AcceleratorCount{ - Type: k, - Count: v, - } - i++ - } - c.IndentedJSON(http.StatusOK, 
config.CapacityData{ - Count: capacities, - }) -} - -func getCapacity(c *gin.Context) { - t := c.Param("type") - cap, exists := system.Capacity(t) - if !exists { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "capacity for " + t + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, config.AcceleratorCount{ - Type: t, - Count: cap, - }) -} - -func setCapacity(c *gin.Context) { - var count config.AcceleratorCount - if err := c.BindJSON(&count); err != nil { - return - } - system.SetCountFromSpec(count) - c.IndentedJSON(http.StatusOK, count) -} - -func removeCapacity(c *gin.Context) { - t := c.Param("type") - cap, _ := system.Capacity(t) - if !system.RemoveCapacity(t) { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "accelerator type " + t + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, config.AcceleratorCount{ - Type: t, - Count: cap, - }) -} - -func setModels(c *gin.Context) { - var modelData config.ModelData - if err := c.BindJSON(&modelData); err != nil { - return - } - system.SetModelsFromSpec(&modelData) - c.IndentedJSON(http.StatusOK, modelData) -} - -func getModels(c *gin.Context) { - modelMap := system.Models() - modelNames := make([]string, len(modelMap)) - i := 0 - for _, model := range modelMap { - modelNames[i] = model.Name() - i++ - } - c.IndentedJSON(http.StatusOK, modelNames) -} - -func getModel(c *gin.Context) { - name := c.Param("name") - model := system.Model(name) - if model == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, model.Spec()) -} - -func addModel(c *gin.Context) { - name := c.Param("name") - system.AddModel(name) - c.IndentedJSON(http.StatusOK, name) -} - -func removeModel(c *gin.Context) { - name := c.Param("name") - if err := system.RemoveModel(name); err != nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, name) -} - -func setServiceClasses(c *gin.Context) { - var serviceClassData config.ServiceClassData - if err := c.BindJSON(&serviceClassData); err != nil { - return - } - system.SetServiceClassesFromSpec(&serviceClassData) - c.IndentedJSON(http.StatusOK, serviceClassData) -} - -func getServiceClasses(c *gin.Context) { - svcMap := system.ServiceClasses() - svcs := &config.ServiceClassData{ - Spec: []config.ServiceClassSpec{}, - } - for _, svc := range svcMap { - svcs.Spec = append(svcs.Spec, svc.Spec()...) 
- } - c.IndentedJSON(http.StatusOK, svcs) -} - -func getServiceClass(c *gin.Context) { - name := c.Param("name") - svc := system.ServiceClass(name) - if svc == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "service class " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, svc.Spec()) -} - -func addServiceClass(c *gin.Context) { - name := c.Param("name") - priority := config.DefaultServiceClassPriority - if prioStr := c.Param("priority"); prioStr != "" { - if prioInt, err := strconv.Atoi(prioStr); err != nil { - c.IndentedJSON(http.StatusBadRequest, gin.H{"message": "service class priority " + prioStr + " invalid"}) - return - } else { - priority = prioInt - } - } - system.AddServiceClass(name, priority) - svc := system.ServiceClass(name) - c.IndentedJSON(http.StatusOK, svc.Spec()) -} - -func removeServiceClass(c *gin.Context) { - name := c.Param("name") - svc := system.ServiceClass(name) - if err := system.RemoveServiceClass(name); err != nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "service class " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, svc.Spec()) -} - -func getServiceClassModelTarget(c *gin.Context) { - name := c.Param("name") - model := c.Param("model") - svc := system.ServiceClass(name) - if svc == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "service class " + name + " not found"}) - return - } - target := svc.ModelTarget(model) - if target == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + model + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, config.ServiceClassSpec{ - Name: name, - Model: model, - SLO_ITL: target.ITL, - SLO_TTW: target.TTW, - SLO_TPS: target.TPS, - }) -} - -func addServiceClassModelTarget(c *gin.Context) { - var targetSpec config.ServiceClassSpec - if err := c.BindJSON(&targetSpec); err != nil { - return - } - svcName := targetSpec.Name - if system.ServiceClass(svcName) == nil { - system.AddServiceClass(svcName, targetSpec.Priority) - } - svc := system.ServiceClass(svcName) - svc.SetTargetFromSpec(&targetSpec) - c.IndentedJSON(http.StatusOK, targetSpec) -} - -func removeServiceClassModelTarget(c *gin.Context) { - name := c.Param("name") - model := c.Param("model") - svc := system.ServiceClass(name) - if svc == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "service class " + name + " not found"}) - return - } - target := svc.ModelTarget(model) - if target == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + model + " not found"}) - return - } - svc.RemoveModelTarget(model) - c.IndentedJSON(http.StatusOK, config.ServiceClassSpec{ - Name: name, - Model: model, - SLO_ITL: target.ITL, - SLO_TTW: target.TTW, - SLO_TPS: target.TPS, - }) -} - -func setServers(c *gin.Context) { - var serverData config.ServerData - if err := c.BindJSON(&serverData); err != nil { - return - } - system.SetServersFromSpec(&serverData) - c.IndentedJSON(http.StatusOK, serverData) -} - -func getServers(c *gin.Context) { - srvMap := system.Servers() - servers := make([]config.ServerSpec, len(srvMap)) - i := 0 - for _, server := range srvMap { - servers[i] = *server.Spec() - i++ - } - serverData := &config.ServerData{ - Spec: servers, - } - c.IndentedJSON(http.StatusOK, serverData) -} - -func getServer(c *gin.Context) { - name := c.Param("name") - server := system.Server(name) - if server == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "server " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, 
server.Spec()) -} - -func addServer(c *gin.Context) { - var server config.ServerSpec - if err := c.BindJSON(&server); err != nil { - return - } - system.AddServerFromSpec(server) - c.IndentedJSON(http.StatusOK, server) -} - -func removeServer(c *gin.Context) { - name := c.Param("name") - server := system.Server(name) - if err := system.RemoveServer(name); err != nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "server " + name + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, server.Spec()) -} - -func getModelAcceleratorPerf(c *gin.Context) { - name := c.Param("name") - acc := c.Param("acc") - model := system.Model(name) - if model == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + name + " not found"}) - return - } - perfData := model.PerfData(acc) - if perfData == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "accelerator " + acc + " not found"}) - return - } - c.IndentedJSON(http.StatusOK, perfData) -} - -func addModelAcceleratorPerf(c *gin.Context) { - var perfData config.ModelAcceleratorPerfData - if err := c.BindJSON(&perfData); err != nil { - return - } - modelName := perfData.Name - model := system.Model(modelName) - if model == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + modelName + " not found"}) - return - } - model.AddPerfDataFromSpec(&perfData) - c.IndentedJSON(http.StatusOK, perfData) -} - -func removeModelAcceleratorPerf(c *gin.Context) { - name := c.Param("name") - acc := c.Param("acc") - model := system.Model(name) - if model == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "model " + name + " not found"}) - return - } - perfData := model.PerfData(acc) - if perfData == nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "accelerator " + acc + " not found"}) - return - } - model.RemovePerfData(acc) - c.IndentedJSON(http.StatusOK, perfData) -} - -func optimize(c *gin.Context) { - var optimizerSpec config.OptimizerSpec - if err := c.BindJSON(&optimizerSpec); err != nil { - return - } - optimizer := solver.NewOptimizerFromSpec(&optimizerSpec) - manager := manager.NewManager(system, optimizer) - system.Calculate() - if err := manager.Optimize(); err != nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "optimization error: " + err.Error()}) - return - } - solution := system.GenerateSolution() - fmt.Println(system) - c.IndentedJSON(http.StatusOK, solution) -} - -func optimizeOne(c *gin.Context) { - var systemData config.SystemData - if err := c.BindJSON(&systemData); err != nil { - return - } - // start with fresh system - system = core.NewSystem() - optimizerSpec := system.SetFromSpec(&systemData.Spec) - optimizer := solver.NewOptimizerFromSpec(optimizerSpec) - manager := manager.NewManager(system, optimizer) - system.Calculate() - if err := manager.Optimize(); err != nil { - c.IndentedJSON(http.StatusNotFound, gin.H{"message": "optimization error: " + err.Error()}) - return - } - solution := system.GenerateSolution() - fmt.Println(system) - c.IndentedJSON(http.StatusOK, solution) -} - -func applyAllocation(c *gin.Context) { - for _, server := range system.Servers() { - server.ApplyDesiredAlloc() - } - c.IndentedJSON(http.StatusOK, "Done") -} diff --git a/rest-server/interfaces.go b/rest-server/interfaces.go deleted file mode 100644 index ba52ff06d..000000000 --- a/rest-server/interfaces.go +++ /dev/null @@ -1,6 +0,0 @@ -package rest - -// interface to a REST server -type RESTServer interface { - Run() -} diff --git a/rest-server/statefull.go 
b/rest-server/statefull.go
deleted file mode 100644
index 876a7223b..000000000
--- a/rest-server/statefull.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package rest
-
-// A stateful REST server with many GET and many POST API calls
-type StateFullServer struct {
-	BaseServer
-}
-
-// create a stateful REST server
-func NewStateFullServer() *StateFullServer {
-	server := &StateFullServer{
-		BaseServer: *NewBaseServer(),
-	}
-
-	server.router.POST("/setAccelerators", setAccelerators)
-	server.router.GET("/getAccelerators", getAccelerators)
-	server.router.GET("/getAccelerator/:name", getAccelerator)
-	server.router.POST("/addAccelerator", addAccelerator)
-	server.router.GET("/removeAccelerator/:name", removeAccelerator)
-
-	server.router.POST("/setCapacities", setCapacities)
-	server.router.GET("/getCapacities", getCapacities)
-	server.router.GET("/getCapacity/:type", getCapacity)
-	server.router.POST("/setCapacity", setCapacity)
-	server.router.GET("/removeCapacity/:type", removeCapacity)
-
-	server.router.POST("/setModels", setModels)
-	server.router.GET("/getModels", getModels)
-	server.router.GET("/getModel/:name", getModel)
-	server.router.GET("/addModel/:name", addModel)
-	server.router.GET("/removeModel/:name", removeModel)
-
-	server.router.POST("/setServiceClasses", setServiceClasses)
-	server.router.GET("/getServiceClasses", getServiceClasses)
-	server.router.GET("/getServiceClass/:name", getServiceClass)
-	server.router.GET("/addServiceClass/:name/:priority", addServiceClass)
-	server.router.GET("/removeServiceClass/:name", removeServiceClass)
-
-	server.router.GET("/getServiceClassModelTarget/:name/:model", getServiceClassModelTarget)
-	server.router.POST("/addServiceClassModelTarget", addServiceClassModelTarget)
-	server.router.GET("/removeServiceClassModelTarget/:name/:model", removeServiceClassModelTarget)
-
-	server.router.POST("/setServers", setServers)
-	server.router.GET("/getServers", getServers)
-	server.router.GET("/getServer/:name", getServer)
-	server.router.POST("/addServer", addServer)
-	server.router.GET("/removeServer/:name", removeServer)
-
-	server.router.GET("/getModelAcceleratorPerf/:name/:acc", getModelAcceleratorPerf)
-	server.router.POST("/addModelAcceleratorPerf", addModelAcceleratorPerf)
-	server.router.GET("/removeModelAcceleratorPerf/:name/:acc", removeModelAcceleratorPerf)
-
-	server.router.POST("/optimize", optimize)
-	server.router.POST("/optimizeOne", optimizeOne)
-	server.router.GET("/applyAllocation", applyAllocation)
-
-	return server
-}
diff --git a/rest-server/stateless.go b/rest-server/stateless.go
deleted file mode 100644
index 5099f2463..000000000
--- a/rest-server/stateless.go
+++ /dev/null
@@ -1,36 +0,0 @@
-package rest
-
-// A stateless REST server with many GET and one POST API call
-type StateLessServer struct {
-	BaseServer
-}
-
-// create a stateless REST server
-func NewStateLessServer() *StateLessServer {
-	server := &StateLessServer{
-		BaseServer: *NewBaseServer(),
-	}
-
-	server.router.POST("/optimizeOne", optimizeOne)
-
-	server.router.GET("/getAccelerators", getAccelerators)
-	server.router.GET("/getAccelerator/:name", getAccelerator)
-
-	server.router.GET("/getCapacities", getCapacities)
-	server.router.GET("/getCapacity/:type", getCapacity)
-
-	server.router.GET("/getModels", getModels)
-	server.router.GET("/getModel/:name", getModel)
-
-	server.router.GET("/getServiceClasses", getServiceClasses)
-	server.router.GET("/getServiceClass/:name", getServiceClass)
-
-	server.router.GET("/getServiceClassModelTarget/:name/:model",
getServiceClassModelTarget) - - server.router.GET("/getServers", getServers) - server.router.GET("/getServer/:name", getServer) - - server.router.GET("/getModelAcceleratorPerf/:name/:acc", getModelAcceleratorPerf) - - return server -} diff --git a/sample-data b/sample-data deleted file mode 160000 index aea8a5575..000000000 --- a/sample-data +++ /dev/null @@ -1 +0,0 @@ -Subproject commit aea8a5575632c6ef0ce0b45765fc802e6f6922a5 diff --git a/samples/local-dev/prometheus-deploy-all-in-one.yaml b/samples/local-dev/prometheus-deploy-all-in-one.yaml new file mode 100644 index 000000000..5fc93312c --- /dev/null +++ b/samples/local-dev/prometheus-deploy-all-in-one.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: ["get", "list", "watch"] +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default +--- +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus +spec: + serviceAccountName: prometheus + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + podMonitorSelector: {} + resources: + requests: + memory: 400Mi +--- diff --git a/samples/local-dev/vllme-deployment-with-service-and-servicemon.yaml b/samples/local-dev/vllme-deployment-with-service-and-servicemon.yaml new file mode 100644 index 000000000..4e06a732c --- /dev/null +++ b/samples/local-dev/vllme-deployment-with-service-and-servicemon.yaml @@ -0,0 +1,64 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllme-deployment + # labels: + # inferno.server.managed: "true" + # inferno.server.name: vllm-001 + # inferno.server.model: llama_13b + # inferno.server.class: Premium + # inferno.server.allocation.accelerator: MI250 + # inferno.server.allocation.maxbatchsize: "8" + # inferno.server.load.rpm: "30.2" + # inferno.server.load.numtokens: "1560" +spec: + replicas: 1 + selector: + matchLabels: + app: vllme + template: + metadata: + labels: + app: vllme + spec: + containers: + - name: vllme + image: quay.io/amalvank/vllme:latest + imagePullPolicy: Always + ports: + - containerPort: 80 +--- +apiVersion: v1 +kind: Service +metadata: + name: vllme-service + labels: + app: vllme +spec: + selector: + app: vllme + ports: + - name: vllme + port: 80 + protocol: TCP + targetPort: 80 + nodePort: 30000 + type: NodePort +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: vllme-servicemonitor + labels: + app: vllme +spec: + selector: + matchLabels: + app: vllme + endpoints: + - port: vllme + path: /metrics + interval: 15s + namespaceSelector: + any: true +--- diff --git a/samples/local-dev/vllme-variantautoscaling.yaml b/samples/local-dev/vllme-variantautoscaling.yaml new file mode 100644 index 000000000..266bb6bbb --- /dev/null +++ b/samples/local-dev/vllme-variantautoscaling.yaml @@ -0,0 +1,38 @@ +apiVersion: 
llmd.ai/v1alpha1
+# Optimizes a variant; create this only when the model is deployed and serving traffic,
+# so that the collector can collect the existing (previous) running metrics of the variant.
+kind: VariantAutoscaling
+metadata:
+  # Unique name of the variant
+  name: vllme-deployment
+  namespace: default
+  labels:
+    inference.optimization/modelName: default
+    inference.optimization/acceleratorName: A100
+# This is essentially static input to the optimizer
+spec:
+  # OpenAI API compatible name of the model
+  modelID: default
+  # Add SLOs in a ConfigMap and add a reference to this per-model data
+  # to avoid duplication; move to ISOs when available
+  sloClassRef:
+    # ConfigMap name to load, in the same namespace as the optimizer object;
+    # we start with static (non-changing) ConfigMaps (for ease of implementation only)
+    name: premium-slo
+    # Key (modelID) present inside the configmap
+    key: opt-125m
+  # Static profiled benchmark data for a variant running on different accelerators
+  modelProfile:
+    accelerators:
+      - acc: "A100"
+        accCount: 1
+        alpha: "20.58"
+        beta: "0.41"
+        maxBatchSize: 32
+        atTokens: 512
+      - acc: "G2"
+        accCount: 1
+        alpha: "17.15"
+        beta: "0.34"
+        maxBatchSize: 38
+        atTokens: 512
\ No newline at end of file
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
new file mode 100644
index 000000000..7f26376d0
--- /dev/null
+++ b/test/e2e/e2e_suite_test.go
@@ -0,0 +1,89 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/llm-d-incubation/inferno-autoscaler/test/utils"
+)
+
+var (
+	// Optional Environment Variables:
+	// - CERT_MANAGER_INSTALL_SKIP=true: Skips CertManager installation during test setup.
+	// These variables are useful if CertManager is already installed, avoiding
+	// re-installation and conflicts.
+	skipCertManagerInstall = os.Getenv("CERT_MANAGER_INSTALL_SKIP") == "true"
+	// isCertManagerAlreadyInstalled will be set to true when CertManager CRDs are found on the cluster
+	isCertManagerAlreadyInstalled = false
+
+	// projectImage is the name of the image which will be built and loaded
+	// with the source code changes to be tested.
+	projectImage = "example.com/inferno-autoscaler:v0.0.1"
+)
+
+// TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated,
+// temporary environment to validate project changes, with the purpose of being used in CI jobs.
+// The default setup requires Kind, builds/loads the Manager Docker image locally, and installs
+// CertManager.
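+// Run it via "make test-e2e", which first verifies that Kind is installed and a cluster is running.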
+func TestE2E(t *testing.T) { + RegisterFailHandler(Fail) + _, _ = fmt.Fprintf(GinkgoWriter, "Starting inferno-autoscaler integration test suite\n") + RunSpecs(t, "e2e suite") +} + +var _ = BeforeSuite(func() { + By("building the manager(Operator) image") + cmd := exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectImage)) + _, err := utils.Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image") + + // TODO(user): If you want to change the e2e test vendor from Kind, ensure the image is + // built and available before running the tests. Also, remove the following block. + By("loading the manager(Operator) image on Kind") + err = utils.LoadImageToKindClusterWithName(projectImage) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind") + + // The tests-e2e are intended to run on a temporary cluster that is created and destroyed for testing. + // To prevent errors when tests run in environments with CertManager already installed, + // we check for its presence before execution. + // Setup CertManager before the suite if not skipped and if not already installed + if !skipCertManagerInstall { + By("checking if cert manager is installed already") + isCertManagerAlreadyInstalled = utils.IsCertManagerCRDsInstalled() + if !isCertManagerAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Installing CertManager...\n") + Expect(utils.InstallCertManager()).To(Succeed(), "Failed to install CertManager") + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "WARNING: CertManager is already installed. Skipping installation...\n") + } + } +}) + +var _ = AfterSuite(func() { + // Teardown CertManager after the suite if not skipped and if it was not already installed + if !skipCertManagerInstall && !isCertManagerAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling CertManager...\n") + utils.UninstallCertManager() + } +}) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go new file mode 100644 index 000000000..1133bbae5 --- /dev/null +++ b/test/e2e/e2e_test.go @@ -0,0 +1,329 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/llm-d-incubation/inferno-autoscaler/test/utils" +) + +// namespace where the project is deployed in +const namespace = "inferno-autoscaler-system" + +// serviceAccountName created for the project +const serviceAccountName = "inferno-autoscaler-controller-manager" + +// metricsServiceName is the name of the metrics service of the project +const metricsServiceName = "inferno-autoscaler-controller-manager-metrics-service" + +// metricsRoleBindingName is the name of the RBAC that will be created to allow get the metrics data +const metricsRoleBindingName = "inferno-autoscaler-metrics-binding" + +var _ = Describe("Manager", Ordered, func() { + var controllerPodName string + + // Before running the tests, set up the environment by creating the namespace, + // enforce the restricted security policy to the namespace, installing CRDs, + // and deploying the controller. + BeforeAll(func() { + By("creating manager namespace") + cmd := exec.Command("kubectl", "create", "ns", namespace) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create namespace") + + By("labeling the namespace to enforce the restricted security policy") + cmd = exec.Command("kubectl", "label", "--overwrite", "ns", namespace, + "pod-security.kubernetes.io/enforce=restricted") + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with restricted policy") + + By("installing CRDs") + cmd = exec.Command("make", "install") + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to install CRDs") + + By("deploying the controller-manager") + cmd = exec.Command("make", "deploy", fmt.Sprintf("IMG=%s", projectImage)) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to deploy the controller-manager") + }) + + // After all tests have been executed, clean up by undeploying the controller, uninstalling CRDs, + // and deleting the namespace. + AfterAll(func() { + By("cleaning up the curl pod for metrics") + cmd := exec.Command("kubectl", "delete", "pod", "curl-metrics", "-n", namespace) + _, _ = utils.Run(cmd) + + By("undeploying the controller-manager") + cmd = exec.Command("make", "undeploy") + _, _ = utils.Run(cmd) + + By("uninstalling CRDs") + cmd = exec.Command("make", "uninstall") + _, _ = utils.Run(cmd) + + By("removing manager namespace") + cmd = exec.Command("kubectl", "delete", "ns", namespace) + _, _ = utils.Run(cmd) + }) + + // After each test, check for failures and collect logs, events, + // and pod descriptions for debugging. 
+ AfterEach(func() { + specReport := CurrentSpecReport() + if specReport.Failed() { + By("Fetching controller manager pod logs") + cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) + controllerLogs, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, "Controller logs:\n %s", controllerLogs) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Controller logs: %s", err) + } + + By("Fetching Kubernetes events") + cmd = exec.Command("kubectl", "get", "events", "-n", namespace, "--sort-by=.lastTimestamp") + eventsOutput, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, "Kubernetes events:\n%s", eventsOutput) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Kubernetes events: %s", err) + } + + By("Fetching curl-metrics logs") + cmd = exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) + metricsOutput, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, "Metrics logs:\n %s", metricsOutput) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get curl-metrics logs: %s", err) + } + + By("Fetching controller manager pod description") + cmd = exec.Command("kubectl", "describe", "pod", controllerPodName, "-n", namespace) + podDescription, err := utils.Run(cmd) + if err == nil { + fmt.Println("Pod description:\n", podDescription) + } else { + fmt.Println("Failed to describe controller pod") + } + } + }) + + SetDefaultEventuallyTimeout(2 * time.Minute) + SetDefaultEventuallyPollingInterval(time.Second) + + Context("Manager", func() { + It("should run successfully", func() { + By("validating that the controller-manager pod is running as expected") + verifyControllerUp := func(g Gomega) { + // Get the name of the controller-manager pod + cmd := exec.Command("kubectl", "get", + "pods", "-l", "control-plane=controller-manager", + "-o", "go-template={{ range .items }}"+ + "{{ if not .metadata.deletionTimestamp }}"+ + "{{ .metadata.name }}"+ + "{{ \"\\n\" }}{{ end }}{{ end }}", + "-n", namespace, + ) + + podOutput, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve controller-manager pod information") + podNames := utils.GetNonEmptyLines(podOutput) + g.Expect(podNames).To(HaveLen(1), "expected 1 controller pod running") + controllerPodName = podNames[0] + g.Expect(controllerPodName).To(ContainSubstring("controller-manager")) + + // Validate the pod's status + cmd = exec.Command("kubectl", "get", + "pods", controllerPodName, "-o", "jsonpath={.status.phase}", + "-n", namespace, + ) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(Equal("Running"), "Incorrect controller-manager pod status") + } + Eventually(verifyControllerUp).Should(Succeed()) + }) + + It("should ensure the metrics endpoint is serving metrics", func() { + By("creating a ClusterRoleBinding for the service account to allow access to metrics") + cmd := exec.Command("kubectl", "create", "clusterrolebinding", metricsRoleBindingName, + "--clusterrole=inferno-autoscaler-metrics-reader", + fmt.Sprintf("--serviceaccount=%s:%s", namespace, serviceAccountName), + ) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create ClusterRoleBinding") + + By("validating that the metrics service is available") + cmd = exec.Command("kubectl", "get", "service", metricsServiceName, "-n", namespace) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Metrics service should exist") + + By("getting the service account token") + token, 
err := serviceAccountToken() + Expect(err).NotTo(HaveOccurred()) + Expect(token).NotTo(BeEmpty()) + + By("waiting for the metrics endpoint to be ready") + verifyMetricsEndpointReady := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "endpoints", metricsServiceName, "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(ContainSubstring("8443"), "Metrics endpoint is not ready") + } + Eventually(verifyMetricsEndpointReady).Should(Succeed()) + + By("verifying that the controller manager is serving the metrics server") + verifyMetricsServerStarted := func(g Gomega) { + cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(ContainSubstring("controller-runtime.metrics\tServing metrics server"), + "Metrics server not yet started") + } + Eventually(verifyMetricsServerStarted).Should(Succeed()) + + By("creating the curl-metrics pod to access the metrics endpoint") + cmd = exec.Command("kubectl", "run", "curl-metrics", "--restart=Never", + "--namespace", namespace, + "--image=curlimages/curl:latest", + "--overrides", + fmt.Sprintf(`{ + "spec": { + "containers": [{ + "name": "curl", + "image": "curlimages/curl:latest", + "command": ["/bin/sh", "-c"], + "args": ["curl -v -k -H 'Authorization: Bearer %s' https://%s.%s.svc.cluster.local:8443/metrics"], + "securityContext": { + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + }, + "runAsNonRoot": true, + "runAsUser": 1000, + "seccompProfile": { + "type": "RuntimeDefault" + } + } + }], + "serviceAccount": "%s" + } + }`, token, metricsServiceName, namespace, serviceAccountName)) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create curl-metrics pod") + + By("waiting for the curl-metrics pod to complete.") + verifyCurlUp := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "pods", "curl-metrics", + "-o", "jsonpath={.status.phase}", + "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(Equal("Succeeded"), "curl pod in wrong status") + } + Eventually(verifyCurlUp, 5*time.Minute).Should(Succeed()) + + By("getting the metrics by checking curl-metrics logs") + metricsOutput := getMetricsOutput() + Expect(metricsOutput).To(ContainSubstring( + "controller_runtime_reconcile_total", + )) + }) + + // +kubebuilder:scaffold:e2e-webhooks-checks + + // TODO: Customize the e2e test suite with scenarios specific to your project. + // Consider applying sample/CR(s) and check their status and/or verifying + // the reconciliation by using the metrics, i.e.: + // metricsOutput := getMetricsOutput() + // Expect(metricsOutput).To(ContainSubstring( + // fmt.Sprintf(`controller_runtime_reconcile_total{controller="%s",result="success"} 1`, + // strings.ToLower(), + // )) + }) +}) + +// serviceAccountToken returns a token for the specified service account in the given namespace. +// It uses the Kubernetes TokenRequest API to generate a token by directly sending a request +// and parsing the resulting token from the API response. 
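+// (On recent kubectl versions this is roughly what "kubectl create token <serviceaccount>" does.)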
+func serviceAccountToken() (string, error) { + const tokenRequestRawString = `{ + "apiVersion": "authentication.k8s.io/v1", + "kind": "TokenRequest" + }` + + // Temporary file to store the token request + secretName := fmt.Sprintf("%s-token-request", serviceAccountName) + tokenRequestFile := filepath.Join("/tmp", secretName) + err := os.WriteFile(tokenRequestFile, []byte(tokenRequestRawString), os.FileMode(0o644)) + if err != nil { + return "", err + } + + var out string + verifyTokenCreation := func(g Gomega) { + // Execute kubectl command to create the token + cmd := exec.Command("kubectl", "create", "--raw", fmt.Sprintf( + "/api/v1/namespaces/%s/serviceaccounts/%s/token", + namespace, + serviceAccountName, + ), "-f", tokenRequestFile) + + output, err := cmd.CombinedOutput() + g.Expect(err).NotTo(HaveOccurred()) + + // Parse the JSON output to extract the token + var token tokenRequest + err = json.Unmarshal(output, &token) + g.Expect(err).NotTo(HaveOccurred()) + + out = token.Status.Token + } + Eventually(verifyTokenCreation).Should(Succeed()) + + return out, err +} + +// getMetricsOutput retrieves and returns the logs from the curl pod used to access the metrics endpoint. +func getMetricsOutput() string { + By("getting the curl-metrics logs") + cmd := exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) + metricsOutput, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to retrieve logs from curl pod") + Expect(metricsOutput).To(ContainSubstring("< HTTP/1.1 200 OK")) + return metricsOutput +} + +// tokenRequest is a simplified representation of the Kubernetes TokenRequest API response, +// containing only the token field that we need to extract. +type tokenRequest struct { + Status struct { + Token string `json:"token"` + } `json:"status"` +} diff --git a/test/utils/utils.go b/test/utils/utils.go new file mode 100644 index 000000000..04a5141cc --- /dev/null +++ b/test/utils/utils.go @@ -0,0 +1,251 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "bufio" + "bytes" + "fmt" + "os" + "os/exec" + "strings" + + . 
"github.com/onsi/ginkgo/v2" //nolint:golint,revive +) + +const ( + prometheusOperatorVersion = "v0.77.1" + prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + + "releases/download/%s/bundle.yaml" + + certmanagerVersion = "v1.16.3" + certmanagerURLTmpl = "https://github.com/cert-manager/cert-manager/releases/download/%s/cert-manager.yaml" +) + +func warnError(err error) { + _, _ = fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) +} + +// Run executes the provided command within this context +func Run(cmd *exec.Cmd) (string, error) { + dir, _ := GetProjectDir() + cmd.Dir = dir + + if err := os.Chdir(cmd.Dir); err != nil { + _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) + } + + cmd.Env = append(os.Environ(), "GO111MODULE=on") + command := strings.Join(cmd.Args, " ") + _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) + output, err := cmd.CombinedOutput() + if err != nil { + return string(output), fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) + } + + return string(output), nil +} + +// InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. +func InstallPrometheusOperator() error { + url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) + cmd := exec.Command("kubectl", "create", "-f", url) + _, err := Run(cmd) + return err +} + +// UninstallPrometheusOperator uninstalls the prometheus +func UninstallPrometheusOperator() { + url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) + cmd := exec.Command("kubectl", "delete", "-f", url) + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// IsPrometheusCRDsInstalled checks if any Prometheus CRDs are installed +// by verifying the existence of key CRDs related to Prometheus. +func IsPrometheusCRDsInstalled() bool { + // List of common Prometheus CRDs + prometheusCRDs := []string{ + "prometheuses.monitoring.coreos.com", + "prometheusrules.monitoring.coreos.com", + "prometheusagents.monitoring.coreos.com", + } + + cmd := exec.Command("kubectl", "get", "crds", "-o", "custom-columns=NAME:.metadata.name") + output, err := Run(cmd) + if err != nil { + return false + } + crdList := GetNonEmptyLines(output) + for _, crd := range prometheusCRDs { + for _, line := range crdList { + if strings.Contains(line, crd) { + return true + } + } + } + + return false +} + +// UninstallCertManager uninstalls the cert manager +func UninstallCertManager() { + url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) + cmd := exec.Command("kubectl", "delete", "-f", url) + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// InstallCertManager installs the cert manager bundle. +func InstallCertManager() error { + url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) + cmd := exec.Command("kubectl", "apply", "-f", url) + if _, err := Run(cmd); err != nil { + return err + } + // Wait for cert-manager-webhook to be ready, which can take time if cert-manager + // was re-installed after uninstalling on a cluster. + cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", + "--for", "condition=Available", + "--namespace", "cert-manager", + "--timeout", "5m", + ) + + _, err := Run(cmd) + return err +} + +// IsCertManagerCRDsInstalled checks if any Cert Manager CRDs are installed +// by verifying the existence of key CRDs related to Cert Manager. 
+func IsCertManagerCRDsInstalled() bool {
+	// List of common Cert Manager CRDs
+	certManagerCRDs := []string{
+		"certificates.cert-manager.io",
+		"issuers.cert-manager.io",
+		"clusterissuers.cert-manager.io",
+		"certificaterequests.cert-manager.io",
+		"orders.acme.cert-manager.io",
+		"challenges.acme.cert-manager.io",
+	}
+
+	// Execute the kubectl command to get all CRDs
+	cmd := exec.Command("kubectl", "get", "crds")
+	output, err := Run(cmd)
+	if err != nil {
+		return false
+	}
+
+	// Check if any of the Cert Manager CRDs are present
+	crdList := GetNonEmptyLines(output)
+	for _, crd := range certManagerCRDs {
+		for _, line := range crdList {
+			if strings.Contains(line, crd) {
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// LoadImageToKindClusterWithName loads a local docker image into the Kind cluster.
+// The cluster name defaults to "kind" and can be overridden with the KIND_CLUSTER
+// environment variable.
+func LoadImageToKindClusterWithName(name string) error {
+	cluster := "kind"
+	if v, ok := os.LookupEnv("KIND_CLUSTER"); ok {
+		cluster = v
+	}
+	kindOptions := []string{"load", "docker-image", name, "--name", cluster}
+	cmd := exec.Command("kind", kindOptions...)
+	_, err := Run(cmd)
+	return err
+}
+
+// GetNonEmptyLines splits the given command output into individual lines,
+// dropping any empty lines.
+func GetNonEmptyLines(output string) []string {
+	var res []string
+	elements := strings.Split(output, "\n")
+	for _, element := range elements {
+		if element != "" {
+			res = append(res, element)
+		}
+	}
+
+	return res
+}
+
+// GetProjectDir returns the project root directory.
+func GetProjectDir() (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return wd, err
+	}
+	wd = strings.ReplaceAll(wd, "/test/e2e", "")
+	return wd, nil
+}
+
+// UncommentCode searches for target in the file and removes the comment prefix
+// from the target content. The target content may span multiple lines.
+func UncommentCode(filename, target, prefix string) error {
+	// false positive
+	// nolint:gosec
+	content, err := os.ReadFile(filename)
+	if err != nil {
+		return err
+	}
+	strContent := string(content)
+
+	idx := strings.Index(strContent, target)
+	if idx < 0 {
+		return fmt.Errorf("unable to find the code %s to be uncommented", target)
+	}
+
+	out := new(bytes.Buffer)
+	_, err = out.Write(content[:idx])
+	if err != nil {
+		return err
+	}
+
+	scanner := bufio.NewScanner(bytes.NewBufferString(target))
+	if !scanner.Scan() {
+		return nil
+	}
+	for {
+		_, err := out.WriteString(strings.TrimPrefix(scanner.Text(), prefix))
+		if err != nil {
+			return err
+		}
+		// Avoid writing a newline in case the previous line was the last in target.
+		if !scanner.Scan() {
+			break
+		}
+		if _, err := out.WriteString("\n"); err != nil {
+			return err
+		}
+	}
+
+	_, err = out.Write(content[idx+len(target):])
+	if err != nil {
+		return err
+	}
+	// false positive
+	// nolint:gosec
+	return os.WriteFile(filename, out.Bytes(), 0o644)
+}
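+
+// A usage sketch for UncommentCode (illustrative only; the file path and the
+// comment markers below are assumptions, not part of this scaffold). An e2e
+// test could enable a commented-out kustomize entry like so:
+//
+//	err := utils.UncommentCode("config/default/kustomization.yaml", "#- ../certmanager", "#")
+//	Expect(err).NotTo(HaveOccurred())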