@@ -82,6 +82,8 @@ type loader struct {
8282 // timestamps maps slot indices to last usage times. Values in this slice
8383 // are only valid if the corresponding reference count is zero.
8484 timestamps []time.Time
85+ // runnerConfigs maps runner keys (backend name, model, mode) to runner configurations.
86+ runnerConfigs map [runnerKey ]inference.BackendConfiguration
8587}
8688
8789// newLoader creates a new loader.
@@ -122,6 +124,7 @@ func newLoader(
122124 references : make ([]uint , nSlots ),
123125 allocations : make ([]uint64 , nSlots ),
124126 timestamps : make ([]time.Time , nSlots ),
127+ runnerConfigs : make (map [runnerKey ]inference.BackendConfiguration ),
125128 }
126129 l .guard <- struct {}{}
127130 return l
@@ -214,9 +217,11 @@ func (l *loader) Unload(ctx context.Context, unload UnloadRequest) int {
214217
215218 return len (l .runners ) - func () int {
216219 if unload .All {
220+ l .runnerConfigs = make (map [runnerKey ]inference.BackendConfiguration )
217221 return l .evict (false )
218222 } else {
219223 for _ , model := range unload .Models {
224+ delete (l .runnerConfigs , runnerKey {unload .Backend , model , inference .BackendModeCompletion })
220225 // Evict both, completion and embedding models. We should consider
221226 // accepting a mode parameter in unload requests.
222227 l .evictRunner (unload .Backend , model , inference .BackendModeCompletion )
@@ -413,9 +418,13 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer
413418
414419 // If we've identified a slot, then we're ready to start a runner.
415420 if slot >= 0 {
421+ var runnerConfig * inference.BackendConfiguration
422+ if rc , ok := l .runnerConfigs [runnerKey {backendName , model , mode }]; ok {
423+ runnerConfig = & rc
424+ }
416425 // Create the runner.
417426 l .log .Infof ("Loading %s backend runner with model %s in %s mode" , backendName , model , mode )
418- runner , err := run (l .log , backend , model , mode , slot )
427+ runner , err := run (l .log , backend , model , mode , slot , runnerConfig )
419428 if err != nil {
420429 l .log .Warnf ("Unable to start %s backend runner with model %s in %s mode: %v" ,
421430 backendName , model , mode , err ,
@@ -492,3 +501,11 @@ func (l *loader) release(runner *runner) {
492501 // Signal waiters.
493502 l .broadcast ()
494503}
504+
505+ func (l * loader ) setRunnerConfig (ctx context.Context , backendName , model string , mode inference.BackendMode , runnerConfig inference.BackendConfiguration ) {
506+ l .lock (ctx )
507+ defer l .unlock ()
508+
509+ l .log .Infof ("Configuring %s runner for %s" , backendName , model )
510+ l .runnerConfigs [runnerKey {backendName , model , mode }] = runnerConfig
511+ }
0 commit comments