@@ -84,6 +84,7 @@ func (s *Scheduler) routeHandlers() map[string]http.HandlerFunc {
8484 m ["GET " + inference .InferencePrefix + "/status" ] = s .GetBackendStatus
8585 m ["GET " + inference .InferencePrefix + "/ps" ] = s .GetRunningBackends
8686 m ["GET " + inference .InferencePrefix + "/df" ] = s .GetDiskUsage
87+ m ["POST " + inference .InferencePrefix + "/unload" ] = s .Unload
8788 return m
8889}
8990
@@ -289,6 +290,33 @@ func (s *Scheduler) GetDiskUsage(w http.ResponseWriter, _ *http.Request) {
289290 }
290291}
291292
293+ // Unload unloads the specified runners (backend, model) from the backend.
294+ // Currently, this doesn't work for runners that are handling an OpenAI request.
295+ func (s * Scheduler ) Unload (w http.ResponseWriter , r * http.Request ) {
296+ body , err := io .ReadAll (http .MaxBytesReader (w , r .Body , maximumOpenAIInferenceRequestSize ))
297+ if err != nil {
298+ if _ , ok := err .(* http.MaxBytesError ); ok {
299+ http .Error (w , "request too large" , http .StatusBadRequest )
300+ } else {
301+ http .Error (w , "unknown error" , http .StatusInternalServerError )
302+ }
303+ return
304+ }
305+
306+ var unloadRequest UnloadRequest
307+ if err := json .Unmarshal (body , & unloadRequest ); err != nil {
308+ http .Error (w , "invalid request" , http .StatusBadRequest )
309+ return
310+ }
311+
312+ unloadedRunners := UnloadResponse {s .loader .Unload (r .Context (), unloadRequest )}
313+ w .Header ().Set ("Content-Type" , "application/json" )
314+ if err := json .NewEncoder (w ).Encode (unloadedRunners ); err != nil {
315+ http .Error (w , fmt .Sprintf ("Failed to encode response: %v" , err ), http .StatusInternalServerError )
316+ return
317+ }
318+ }
319+
292320// ServeHTTP implements net/http.Handler.ServeHTTP.
293321func (s * Scheduler ) ServeHTTP (w http.ResponseWriter , r * http.Request ) {
294322 s .router .ServeHTTP (w , r )
0 commit comments