-
Notifications
You must be signed in to change notification settings - Fork 105
[WIP] Online Batch Support #490
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
shimib
wants to merge
3
commits into
llm-d:main
Choose a base branch
from
shimib:online-batch
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+906
−5
Draft
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| # Build Stage: using Go 1.24 image | ||
| FROM quay.io/projectquay/golang:1.24 AS builder | ||
| ARG TARGETOS | ||
| ARG TARGETARCH | ||
| ARG COMMIT_SHA=unknown | ||
| ARG BUILD_REF | ||
|
|
||
| WORKDIR /workspace | ||
| # Copy the Go Modules manifests | ||
| COPY go.mod go.mod | ||
| COPY go.sum go.sum | ||
| # cache deps before building and copying source so that we don't need to re-download as much | ||
| # and so that source changes don't invalidate our downloaded layer | ||
| RUN go mod download | ||
|
|
||
| # Copy the go source | ||
| COPY cmd/batch/ cmd/batch/ | ||
| COPY pkg/batch/ pkg/batch/ | ||
| COPY pkg/metrics/ pkg/metrics/ | ||
| COPY pkg/common pkg/common | ||
|
|
||
| # Build | ||
| # GOARCH has no default value so that the binary is built for the platform of the host where the command | ||
| # was called. For example, if we call make image-build in a local env running on Apple Silicon (M1), | ||
| # the docker BUILDPLATFORM arg will be linux/arm64, whereas for Apple x86 it will be linux/amd64. Therefore, | ||
| # by leaving it empty we can ensure that the container and the binary shipped in it will have the same platform. | ||
| ENV CGO_ENABLED=0 | ||
| ENV GOOS=${TARGETOS:-linux} | ||
| ENV GOARCH=${TARGETARCH} | ||
| RUN go build -a -o bin/batch \ | ||
| -ldflags="-X github.com/llm-d/llm-d-inference-scheduler/pkg/batch/version.CommitSHA=${COMMIT_SHA} -X github.com/llm-d/llm-d-inference-scheduler/pkg/batch/version.BuildRef=${BUILD_REF}" \ | ||
| cmd/batch/main.go | ||
|
|
||
| FROM registry.access.redhat.com/ubi9/ubi-micro:latest | ||
| WORKDIR / | ||
| COPY --from=builder /workspace/bin/batch /app/batch | ||
| USER 65532:65532 | ||
|
|
||
| ENTRYPOINT ["/app/batch"] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| package main | ||
|
|
||
| import ( | ||
| "os" | ||
|
|
||
| ctrl "sigs.k8s.io/controller-runtime" | ||
|
|
||
| "github.com/llm-d/llm-d-inference-scheduler/cmd/batch/runner" | ||
| ) | ||
|
|
||
| func main() { | ||
|
|
||
| if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil { | ||
| os.Exit(1) | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| package runner | ||
|
|
||
| import ( | ||
| "context" | ||
| "flag" | ||
| "net/http" | ||
|
|
||
| "github.com/llm-d/llm-d-inference-scheduler/pkg/batch" | ||
| "github.com/llm-d/llm-d-inference-scheduler/pkg/batch/redis" | ||
| uberzap "go.uber.org/zap" | ||
| "go.uber.org/zap/zapcore" | ||
| ctrl "sigs.k8s.io/controller-runtime" | ||
| "sigs.k8s.io/controller-runtime/pkg/log/zap" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" | ||
| ) | ||
|
|
||
// Runner is the entry point of the batch processor binary. It carries no
// state of its own; configuration comes from command-line flags.
type Runner struct{}
|
|
||
| var ( | ||
| setupLog = ctrl.Log.WithName("setup") | ||
| logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") | ||
| concurrency = flag.Int("concurrency", 8, "number of concurrent workers") | ||
| endpoint = flag.String("endpoint", "", "inference endpoint") | ||
| ) | ||
|
|
||
| func NewRunner() *Runner { | ||
| return &Runner{} | ||
| } | ||
|
|
||
| func (r *Runner) Run(ctx context.Context) error { | ||
| opts := zap.Options{ | ||
| Development: true, | ||
| } | ||
| opts.BindFlags(flag.CommandLine) | ||
| flag.Parse() | ||
| initLogging(&opts) | ||
|
|
||
| /*if *tracing { | ||
| err := common.InitTracing(ctx, setupLog) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| }*/ | ||
|
|
||
| ////////setupLog.Info("GIE build", "commit-sha", version.CommitSHA, "build-ref", version.BuildRef) | ||
|
|
||
| // Validate flags | ||
| if err := validateFlags(); err != nil { | ||
| setupLog.Error(err, "Failed to validate flags") | ||
| return err | ||
| } | ||
|
|
||
| // Print all flag values | ||
| flags := make(map[string]any) | ||
| flag.VisitAll(func(f *flag.Flag) { | ||
| flags[f.Name] = f.Value | ||
| }) | ||
| setupLog.Info("Flags processed", "flags", flags) | ||
|
|
||
| httpClient := &http.Client{ | ||
| // TODO: configure | ||
| } | ||
| var policy batch.RequestPolicy = batch.NewRandomRobinPolicy() | ||
|
|
||
| var impl batch.Flow = redis.NewRedisMQFlow("localhost:16379") | ||
| requestChannel := policy.MergeRequestChannels(impl.RequestChannels()).Channel | ||
| for w := 1; w <= *concurrency; w++ { | ||
| go batch.Worker(ctx, *endpoint, httpClient, requestChannel, impl.RetryChannel(), impl.ResultChannel()) | ||
| } | ||
|
|
||
| impl.Start(ctx) | ||
| <-ctx.Done() | ||
| return nil | ||
| } | ||
|
|
||
| // TODO: is this dup of | ||
| func initLogging(opts *zap.Options) { | ||
| // Unless -zap-log-level is explicitly set, use -v | ||
| useV := true | ||
| flag.Visit(func(f *flag.Flag) { | ||
| if f.Name == "zap-log-level" { | ||
| useV = false | ||
| } | ||
| }) | ||
| if useV { | ||
| // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level | ||
| lvl := -1 * (*logVerbosity) | ||
| opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) | ||
| } | ||
|
|
||
| logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) | ||
| ctrl.SetLogger(logger) | ||
| } | ||
|
|
||
| func validateFlags() error { | ||
|
|
||
| return nil | ||
| } |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| # Batch Processor - User Guide | ||
|
|
||
| The batch processor helps in workflows where requests are latency-tolerant, i.e., where SLOs are measured in minutes or hours rather than seconds. | ||
|
|
||
| The batch processor pulls requests from a message queue (or from several MQs, according to a policy), sends them to the Inference Gateway (IGW), and retries if necessary (e.g., when a request was shed). | ||
|
|
||
|  |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| # Batch Processor | ||
|
|
||
| ## Overview | ||
| The batch processor (BP) provides asynchronous workflows for variable SLO-based inference requests. | ||
|
|
||
|
|
||
| ## Architecture | ||
|
|
||
| An underlying implementation should provide persistent messaging that adheres to the interface defined in [api.go](api.go). | ||
|
|
||
| A pluggable request policy is used to merge multiple request channels into a single request channel on which the batch worker is listening. | ||
|
|
||
| An example for such a policy is a [Random Robin Policy](random_robin_policy.go). | ||
|
|
||
| Each [Batch Processor worker](worker.go) is responsible for pulling requests from the merged request channel, submitting them to the IGW, and applying retry logic if needed. | ||
|
|
||
|
|
||
|
|
||
| ### Requests | ||
|
|
||
| Request messages should have the following format: | ||
| ```json | ||
| { | ||
| "id" : "unique identifier for result mapping", | ||
| "deadline" : "deadline in Unix seconds", | ||
| "payload" : {regular inference payload} | ||
| } | ||
| ``` | ||
|
|
||
| Example: | ||
| ```json | ||
| { | ||
| "id" : "19933123533434", | ||
| "deadline" : "1764045130", | ||
| "payload": {"model":"food-review","prompt":"hi", "max_tokens":10,"temperature":0} | ||
| } | ||
| ``` | ||
|
|
||
| ### Results | ||
|
|
||
| Messages on the results channel will have the following structure: | ||
|
|
||
| ```json | ||
| { | ||
| "id" : "id mapped to the request", | ||
| "payload" : {/*inference payload*/} , | ||
| // or | ||
| "error" : "error's reason" | ||
| } | ||
| ``` | ||
|
|
||
|
|
||
| ## Implementations | ||
|
|
||
| ### Redis | ||
|
|
||
| An example implementation based on Redis is provided which behaves as follows: | ||
|
|
||
| - Redis Lists as the request queues. | ||
| - Redis Sorted Set as the retry exponential backoff implementation. | ||
| - Redis List as the result queue. | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| package batch | ||
|
|
||
| import "context" | ||
|
|
||
// Flow is the messaging abstraction the batch processor runs on. An
// implementation supplies the request channels and accepts retry and result
// messages on the channels it exposes.
type Flow interface {
	// Start starts processing requests.
	Start(ctx context.Context)

	// RequestChannels returns the channels for requests. The implementation is responsible for populating them.
	RequestChannels() []RequestChannel
	// RetryChannel returns the channel that accepts messages to be retried, together with their backoff delay.
	RetryChannel() chan RetryMessage
	// ResultChannel returns the channel for storing the results.
	ResultChannel() chan ResultMessage
}
|
|
||
// RequestPolicy decides how several request channels are combined into the
// single request channel that is returned.
type RequestPolicy interface {
	// MergeRequestChannels merges channels into one RequestChannel.
	MergeRequestChannels(channels []RequestChannel) RequestChannel
}
|
|
||
// RequestMessage is a single queued inference request and its JSON wire format.
type RequestMessage struct {
	// Id is carried as "id" in JSON.
	Id string `json:"id"`
	// RetryCount is carried as "retry_count" and omitted from JSON when zero.
	// NOTE(review): presumably the number of retries so far - confirm against the worker.
	RetryCount int `json:"retry_count,omitempty"`
	// DeadlineUnixSec is carried as "deadline"; it is a string, not a number.
	// NOTE(review): the name suggests a Unix timestamp in seconds - confirm where it is parsed.
	DeadlineUnixSec string `json:"deadline"`
	// Payload is carried as "payload": an arbitrary JSON object.
	Payload map[string]any `json:"payload"`
}
|
|
||
// RequestChannel pairs a channel of request messages with free-form metadata.
type RequestChannel struct {
	// Channel carries the request messages.
	Channel chan RequestMessage
	// Metadata holds arbitrary key/value data about the channel.
	// NOTE(review): its keys and how policies use them are not defined here - confirm.
	Metadata map[string]any
}
|
|
||
// RetryMessage is a request to be retried together with its backoff delay.
type RetryMessage struct {
	// RequestMessage is the embedded request that should be retried.
	RequestMessage
	// BackoffDurationSeconds is the backoff delay, in seconds.
	BackoffDurationSeconds float64
}
|
|
||
// ResultMessage is a message placed on the result channel and its JSON wire format.
//
// NOTE(review): the README describes an "error" field as an alternative to
// "payload", but this struct has no such field - confirm how errors are reported.
type ResultMessage struct {
	// Id is carried as "id" in JSON.
	Id string `json:"id"`
	// Payload is carried as "payload": an arbitrary JSON object.
	Payload map[string]any `json:"payload"`
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shimib This only supports /v1/completions. Is that on purpose?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It shouldn't, endpoint will be configurable. Will be in the user guide