@@ -42,12 +42,20 @@ var (
4242type runnerKey struct {
4343 // backend is the name of the backend associated with the runner.
4444 backend string
45- // model is the model associated with the runner.
46- model string
45+ // modelID is the ID (digest) of the model associated with the runner; the reference (tag) used to load it is tracked separately in runnerInfo.
46+ modelID string
4747 // mode is the operation mode associated with the runner.
4848 mode inference.BackendMode
4949}
5050
51+ // runnerInfo is the value stored in loader.runners for each registered runner: the slot it occupies and the original model reference used to load it.
52+ type runnerInfo struct {
53+ // slot is the slot index where the runner is stored; it indexes the loader's slots, references, allocations, and timestamps.
54+ slot int
55+ // modelRef is the original model reference (tag) used to load the runner; it is reported in log messages alongside the model ID.
56+ modelRef string
57+ }
58+
5159// loader manages the loading and unloading of backend runners. It regulates
5260// active backends in a manner that avoids exhausting system resources. Loaders
5361// assume that all of their backends have been installed, so no load requests
@@ -80,7 +88,7 @@ type loader struct {
8088 // polling. Each signaling channel should be buffered (with size 1).
8189 waiters map [chan <- struct {}]bool
8290 // runners maps runner keys to their runner info (slot index and original model reference).
83- runners map [runnerKey ]int
91+ runners map [runnerKey ]runnerInfo
8492 // slots maps slot indices to associated runners. A slot is considered free
8593 // if the runner value in it is nil.
8694 slots []* runner
@@ -151,7 +159,7 @@ func newLoader(
151159 guard : make (chan struct {}, 1 ),
152160 availableMemory : totalMemory ,
153161 waiters : make (map [chan <- struct {}]bool ),
154- runners : make (map [runnerKey ]int , nSlots ),
162+ runners : make (map [runnerKey ]runnerInfo , nSlots ),
155163 slots : make ([]* runner , nSlots ),
156164 references : make ([]uint , nSlots ),
157165 allocations : make ([]uint64 , nSlots ),
@@ -196,24 +204,24 @@ func (l *loader) broadcast() {
196204// lock. It returns the number of remaining runners.
197205func (l * loader ) evict (idleOnly bool ) int {
198206 now := time .Now ()
199- for r , slot := range l .runners {
200- unused := l .references [slot ] == 0
201- idle := unused && now .Sub (l .timestamps [slot ]) > l .runnerIdleTimeout
207+ for r , runnerInfo := range l .runners {
208+ unused := l .references [runnerInfo . slot ] == 0
209+ idle := unused && now .Sub (l .timestamps [runnerInfo . slot ]) > l .runnerIdleTimeout
202210 defunct := false
203211 select {
204- case <- l .slots [slot ].done :
212+ case <- l .slots [runnerInfo . slot ].done :
205213 defunct = true
206214 default :
207215 }
208216 if unused && (! idleOnly || idle || defunct ) {
209- l .log .Infof ("Evicting %s backend runner with model %s in %s mode" ,
210- r .backend , r .model , r .mode ,
217+ l .log .Infof ("Evicting %s backend runner with model %s (%s) in %s mode" ,
218+ r .backend , r .modelID , runnerInfo . modelRef , r .mode ,
211219 )
212- l .slots [slot ].terminate ()
213- l .slots [slot ] = nil
214- l .availableMemory += l .allocations [slot ]
215- l .allocations [slot ] = 0
216- l .timestamps [slot ] = time.Time {}
220+ l .slots [runnerInfo . slot ].terminate ()
221+ l .slots [runnerInfo . slot ] = nil
222+ l .availableMemory += l .allocations [runnerInfo . slot ]
223+ l .allocations [runnerInfo . slot ] = 0
224+ l .timestamps [runnerInfo . slot ] = time.Time {}
217225 delete (l .runners , r )
218226 }
219227 }
@@ -224,17 +232,17 @@ func (l *loader) evict(idleOnly bool) int {
224232// It returns the number of remaining runners.
225233func (l * loader ) evictRunner (backend , model string , mode inference.BackendMode ) int {
226234 allBackends := backend == ""
227- for r , slot := range l .runners {
228- unused := l .references [slot ] == 0
229- if unused && (allBackends || r .backend == backend ) && r .model == model && r .mode == mode {
230- l .log .Infof ("Evicting %s backend runner with model %s in %s mode" ,
231- r .backend , r .model , r .mode ,
235+ for r , runnerInfo := range l .runners {
236+ unused := l .references [runnerInfo . slot ] == 0
237+ if unused && (allBackends || r .backend == backend ) && r .modelID == model && r .mode == mode {
238+ l .log .Infof ("Evicting %s backend runner with model %s (%s) in %s mode" ,
239+ r .backend , r .modelID , runnerInfo . modelRef , r .mode ,
232240 )
233- l .slots [slot ].terminate ()
234- l .slots [slot ] = nil
235- l .availableMemory += l .allocations [slot ]
236- l .allocations [slot ] = 0
237- l .timestamps [slot ] = time.Time {}
241+ l .slots [runnerInfo . slot ].terminate ()
242+ l .slots [runnerInfo . slot ] = nil
243+ l .availableMemory += l .allocations [runnerInfo . slot ]
244+ l .allocations [runnerInfo . slot ] = 0
245+ l .timestamps [runnerInfo . slot ] = time.Time {}
238246 delete (l .runners , r )
239247 }
240248 }
@@ -254,11 +262,12 @@ func (l *loader) Unload(ctx context.Context, unload UnloadRequest) int {
254262 return l .evict (false )
255263 } else {
256264 for _ , model := range unload .Models {
265+ modelID := l .modelManager .ResolveModelID (model )
257266 delete (l .runnerConfigs , runnerKey {unload .Backend , model , inference .BackendModeCompletion }) // NOTE(review): keyed by the raw model reference, but load and setRunnerConfig key runnerConfigs by modelID — confirm whether this should use modelID.
258267 // Evict both the completion and the embedding runner for this model. We should consider
259268 // accepting a mode parameter in unload requests.
260- l .evictRunner (unload .Backend , model , inference .BackendModeCompletion )
261- l .evictRunner (unload .Backend , model , inference .BackendModeEmbedding )
269+ l .evictRunner (unload .Backend , modelID , inference .BackendModeCompletion )
270+ l .evictRunner (unload .Backend , modelID , inference .BackendModeEmbedding )
262271 }
263272 return len (l .runners )
264273 }
@@ -282,15 +291,15 @@ func stopAndDrainTimer(timer *time.Timer) {
282291func (l * loader ) idleCheckDuration () time.Duration {
283292 // Compute the oldest usage time for any idle runner.
284293 var oldest time.Time
285- for _ , slot := range l .runners {
294+ for _ , runnerInfo := range l .runners {
286295 select {
287- case <- l .slots [slot ].done :
296+ case <- l .slots [runnerInfo . slot ].done :
288297 // Check immediately if a runner is defunct
289298 return 0
290299 default :
291300 }
292- if l .references [slot ] == 0 {
293- timestamp := l .timestamps [slot ]
301+ if l .references [runnerInfo . slot ] == 0 {
302+ timestamp := l .timestamps [runnerInfo . slot ]
294303 if oldest .IsZero () || timestamp .Before (oldest ) {
295304 oldest = timestamp
296305 }
@@ -378,10 +387,10 @@ func (l *loader) run(ctx context.Context) {
378387 }
379388}
380389
381- // load allocates a runner using the specified backend and model . If allocated,
390+ // load allocates a runner using the specified backend and modelID (modelRef is the original model reference, recorded with the runner for logging). If allocated,
382391// it should be released by the caller using the release mechanism (once the
383392// runner is no longer needed).
384- func (l * loader ) load (ctx context.Context , backendName , model string , mode inference.BackendMode ) (* runner , error ) {
393+ func (l * loader ) load (ctx context.Context , backendName , modelID , modelRef string , mode inference.BackendMode ) (* runner , error ) {
385394 // Grab the backend.
386395 backend , ok := l .backends [backendName ]
387396 if ! ok {
@@ -426,20 +435,20 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer
426435 }
427436
428437 // See if we can satisfy the request with an existing runner.
429- existing , ok := l .runners [runnerKey {backendName , model , mode }]
438+ existing , ok := l .runners [runnerKey {backendName , modelID , mode }]
430439 if ok {
431440 select {
432- case <- l .slots [existing ].done :
433- l .log .Warnf ("%s runner for %s is defunct. Waiting for it to be evicted." , backendName , model )
434- if l .references [existing ] == 0 {
435- l .evictRunner (backendName , model , mode )
441+ case <- l .slots [existing . slot ].done :
442+ l .log .Warnf ("%s runner for %s is defunct. Waiting for it to be evicted." , backendName , existing . modelRef )
443+ if l .references [existing . slot ] == 0 {
444+ l .evictRunner (backendName , modelID , mode )
436445 } else {
437446 goto WaitForChange
438447 }
439448 default :
440- l .references [existing ] += 1
441- l .timestamps [existing ] = time.Time {}
442- return l .slots [existing ], nil
449+ l .references [existing . slot ] += 1
450+ l .timestamps [existing . slot ] = time.Time {}
451+ return l .slots [existing . slot ], nil
443452 }
444453 }
445454
@@ -462,15 +471,15 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer
462471 // If we've identified a slot, then we're ready to start a runner.
463472 if slot >= 0 {
464473 var runnerConfig * inference.BackendConfiguration
465- if rc , ok := l .runnerConfigs [runnerKey {backendName , model , mode }]; ok {
474+ if rc , ok := l .runnerConfigs [runnerKey {backendName , modelID , mode }]; ok {
466475 runnerConfig = & rc
467476 }
468477 // Create the runner.
469- l .log .Infof ("Loading %s backend runner with model %s in %s mode" , backendName , model , mode )
470- runner , err := run (l .log , backend , model , mode , slot , runnerConfig , l .openAIRecorder )
478+ l .log .Infof ("Loading %s backend runner with model %s in %s mode" , backendName , modelID , mode )
479+ runner , err := run (l .log , backend , modelID , mode , slot , runnerConfig , l .openAIRecorder )
471480 if err != nil {
472481 l .log .Warnf ("Unable to start %s backend runner with model %s in %s mode: %v" ,
473- backendName , model , mode , err ,
482+ backendName , modelID , mode , err ,
474483 )
475484 return nil , fmt .Errorf ("unable to start runner: %w" , err )
476485 }
@@ -484,14 +493,14 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer
484493 if err := runner .wait (ctx ); err != nil {
485494 runner .terminate ()
486495 l .log .Warnf ("Initialization for %s backend runner with model %s in %s mode failed: %v" ,
487- backendName , model , mode , err ,
496+ backendName , modelID , mode , err ,
488497 )
489498 return nil , fmt .Errorf ("error waiting for runner to be ready: %w" , err )
490499 }
491500
492501 // Perform registration and return the runner.
493502 l .availableMemory -= memory
494- l .runners [runnerKey {backendName , model , mode }] = slot
503+ l .runners [runnerKey {backendName , modelID , mode }] = runnerInfo { slot , modelRef }
495504 l .slots [slot ] = runner
496505 l .references [slot ] = 1
497506 l .allocations [slot ] = memory
@@ -523,17 +532,17 @@ func (l *loader) release(runner *runner) {
523532 slot := l .runners [runnerKey {runner .backend .Name (), runner .model , runner .mode }]
524533
525534 // Decrement the runner's reference count.
526- l .references [slot ] -= 1
535+ l .references [slot . slot ] -= 1
527536
528537 // If the runner's reference count is now zero, then check if it is still
529538 // active, and record now as its idle start time and signal the idle
530539 // checker.
531- if l .references [slot ] == 0 {
540+ if l .references [slot . slot ] == 0 {
532541 select {
533542 case <- runner .done :
534543 l .evictRunner (runner .backend .Name (), runner .model , runner .mode )
535544 default :
536- l .timestamps [slot ] = time .Now ()
545+ l .timestamps [slot . slot ] = time .Now ()
537546 select {
538547 case l .idleCheck <- struct {}{}:
539548 default :
@@ -545,22 +554,22 @@ func (l *loader) release(runner *runner) {
545554 l .broadcast ()
546555}
547556
548- func (l * loader ) setRunnerConfig (ctx context.Context , backendName , model string , mode inference.BackendMode , runnerConfig inference.BackendConfiguration ) error {
557+ func (l * loader ) setRunnerConfig (ctx context.Context , backendName , modelID string , mode inference.BackendMode , runnerConfig inference.BackendConfiguration ) error {
549558 l .lock (ctx )
550559 defer l .unlock ()
551560
552- runnerId := runnerKey {backendName , model , mode }
561+ runnerId := runnerKey {backendName , modelID , mode }
553562
554563 // If the configuration hasn't changed, then just return.
555564 if existingConfig , ok := l .runnerConfigs [runnerId ]; ok && reflect .DeepEqual (runnerConfig , existingConfig ) {
556- l .log .Infof ("Configuration for %s runner for model %s unchanged" , backendName , model )
565+ l .log .Infof ("Configuration for %s runner for modelID %s unchanged" , backendName , modelID )
557566 return nil
558567 }
559568
560569 // If there's an active runner whose configuration we want to override, then
561570 // try evicting it (because it may not be in use).
562571 if _ , ok := l .runners [runnerId ]; ok {
563- l .evictRunner (backendName , model , mode )
572+ l .evictRunner (backendName , modelID , mode )
564573 }
565574
566575 // If there's still an active runner, then we can't (or at least
@@ -569,7 +578,7 @@ func (l *loader) setRunnerConfig(ctx context.Context, backendName, model string,
569578 return errRunnerAlreadyActive
570579 }
571580
572- l .log .Infof ("Configuring %s runner for %s" , backendName , model )
581+ l .log .Infof ("Configuring %s runner for %s" , backendName , modelID )
573582 l .runnerConfigs [runnerId ] = runnerConfig
574583 return nil
575584}
0 commit comments