Merge pull request #46 from doringeman/unload

doringeman · web-flow · commit c818aab53eb6 · 2025-05-21T18:26:08.000+03:00
Add /engines/unload
diff --git a/pkg/inference/scheduling/api.go b/pkg/inference/scheduling/api.go
@@ -61,3 +61,15 @@ type DiskUsage struct {
 	ModelsDiskUsage         float64 `json:"models_disk_usage"`
 	DefaultBackendDiskUsage float64 `json:"default_backend_disk_usage"`
 }
+
+// UnloadRequest is used to specify which models to unload.
+type UnloadRequest struct {
+	All     bool   `json:"all"`     // when true, every runner is evicted and Backend/Model are not consulted
+	Backend string `json:"backend"` // backend to match; an empty value matches any backend
+	Model   string `json:"model"`   // model whose runners are unloaded when All is false
+}
+
+// UnloadResponse reports how many runners (backend, model) were unloaded by a request.
+type UnloadResponse struct {
+	UnloadedRunners int `json:"unloaded_runners"` // runner count before the unload minus the count after
+}
diff --git a/pkg/inference/scheduling/loader.go b/pkg/inference/scheduling/loader.go
@@ -177,6 +177,42 @@ func (l *loader) evict(idleOnly bool) int {
 	return len(l.runners)
 }
 
+// evictRunner evicts every runner serving model on backend (on any backend when backend is
+// empty). The caller must hold the loader lock. It returns the number of remaining runners.
+func (l *loader) evictRunner(backend, model string) int {
+	allBackends := backend == "" // an empty backend acts as a wildcard
+	for r, slot := range l.runners {
+		if (allBackends || r.backend == backend) && r.model == model {
+			l.log.Infof("Evicting %s backend runner with model %s in %s mode",
+				r.backend, r.model, r.mode,
+			)
+			l.slots[slot].terminate()
+			l.slots[slot] = nil
+			l.availableMemory += l.allocations[slot] // give the slot's allocation back to availableMemory
+			l.allocations[slot] = 0
+			l.timestamps[slot] = time.Time{}
+			delete(l.runners, r) // removing entries while ranging over a map is permitted in Go
+		}
+	}
+	return len(l.runners)
+}
+
+// Unload unloads runners and returns the number of unloaded runners (0 if the lock is not acquired).
+func (l *loader) Unload(ctx context.Context, unload UnloadRequest) int {
+	if !l.lock(ctx) {
+		return 0
+	}
+	defer l.unlock()
+
+	// Snapshot the count first: both eviction paths return how many runners remain.
+	before := len(l.runners)
+	if unload.All {
+		return before - l.evict(false)
+	}
+	return before - l.evictRunner(unload.Backend, unload.Model)
+}
+
 // stopAndDrainTimer stops and drains a timer without knowing if it was running.
 func stopAndDrainTimer(timer *time.Timer) {
 	timer.Stop()
diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
@@ -84,6 +84,7 @@ func (s *Scheduler) routeHandlers() map[string]http.HandlerFunc {
 	m["GET "+inference.InferencePrefix+"/status"] = s.GetBackendStatus
 	m["GET "+inference.InferencePrefix+"/ps"] = s.GetRunningBackends
 	m["GET "+inference.InferencePrefix+"/df"] = s.GetDiskUsage
+	m["POST "+inference.InferencePrefix+"/unload"] = s.Unload
 	return m
 }
 
@@ -289,6 +290,33 @@ func (s *Scheduler) GetDiskUsage(w http.ResponseWriter, _ *http.Request) {
 	}
 }
 
+// Unload decodes a JSON UnloadRequest, unloads the matching runners and replies with a JSON
+// UnloadResponse. Currently, this doesn't work for runners that are handling an OpenAI request.
+func (s *Scheduler) Unload(w http.ResponseWriter, r *http.Request) {
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		if _, ok := err.(*http.MaxBytesError); ok { // the body exceeded the size limit
+			http.Error(w, "request too large", http.StatusBadRequest)
+		} else {
+			http.Error(w, "unknown error", http.StatusInternalServerError)
+		}
+		return
+	}
+
+	var unloadRequest UnloadRequest
+	if err := json.Unmarshal(body, &unloadRequest); err != nil {
+		http.Error(w, "invalid request", http.StatusBadRequest)
+		return
+	}
+
+	unloadedRunners := UnloadResponse{s.loader.Unload(r.Context(), unloadRequest)}
+	w.Header().Set("Content-Type", "application/json") // headers must be set before the body is written
+	if err := json.NewEncoder(w).Encode(unloadedRunners); err != nil {
+		http.Error(w, fmt.Sprintf("Failed to encode response: %v", err), http.StatusInternalServerError)
+		return
+	}
+}
+
 // ServeHTTP implements net/http.Handler.ServeHTTP.
 func (s *Scheduler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	s.router.ServeHTTP(w, r)