Commit 814e08c
fix(scheduling): query "/" to check if a runner is ready
The llama.cpp server returns an error while the model is still loading: https://github.com/ggml-org/llama.cpp/blob/459c0c2c1a400f960d7b8e8d94d31a8426f80986/tools/server/server.cpp#L4220. Wait for the model to be loaded by probing the correct endpoint, since /v1/models does not return 503 during loading.

Signed-off-by: Dorin Geman <dorin.geman@docker.com>
1 parent 9f27104 commit 814e08c

File tree: 1 file changed (+1, -1 lines)

pkg/inference/scheduling/runner.go

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ func (r *runner) wait(ctx context.Context) error {
 	default:
 	}
 	// Create and execute a request targeting a known-valid endpoint.
-	readyRequest, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/v1/models", http.NoBody)
+	readyRequest, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/", http.NoBody)
 	if err != nil {
 		return fmt.Errorf("readiness request creation failed: %w", err)
 	}

0 commit comments