|
29 | 29 | // errModelTooBig indicates that the model is too big to ever load into the |
30 | 30 | // available system memory. |
31 | 31 | errModelTooBig = errors.New("model too big") |
| 32 | + // errRunnerAlreadyActive indicates that a given runner is already active |
| 33 | + // and therefore can't, for example, be reconfigured |
| 34 | + errRunnerAlreadyActive = errors.New("runner already active") |
32 | 35 | ) |
33 | 36 |
|
34 | 37 | // runnerKey is used to index runners. |
@@ -82,6 +85,8 @@ type loader struct { |
82 | 85 | // timestamps maps slot indices to last usage times. Values in this slice |
83 | 86 | // are only valid if the corresponding reference count is zero. |
84 | 87 | timestamps []time.Time |
| 88 | + // runnerConfigs maps model names to runner configurations |
| 89 | + runnerConfigs map[runnerKey]inference.BackendConfiguration |
85 | 90 | } |
86 | 91 |
|
87 | 92 | // newLoader creates a new loader. |
@@ -122,6 +127,7 @@ func newLoader( |
122 | 127 | references: make([]uint, nSlots), |
123 | 128 | allocations: make([]uint64, nSlots), |
124 | 129 | timestamps: make([]time.Time, nSlots), |
| 130 | + runnerConfigs: make(map[runnerKey]inference.BackendConfiguration), |
125 | 131 | } |
126 | 132 | l.guard <- struct{}{} |
127 | 133 | return l |
@@ -214,9 +220,11 @@ func (l *loader) Unload(ctx context.Context, unload UnloadRequest) int { |
214 | 220 |
|
215 | 221 | return len(l.runners) - func() int { |
216 | 222 | if unload.All { |
| 223 | + l.runnerConfigs = make(map[runnerKey]inference.BackendConfiguration) |
217 | 224 | return l.evict(false) |
218 | 225 | } else { |
219 | 226 | for _, model := range unload.Models { |
| 227 | + delete(l.runnerConfigs, runnerKey{unload.Backend, model, inference.BackendModeCompletion}) |
220 | 228 | // Evict both, completion and embedding models. We should consider |
221 | 229 | // accepting a mode parameter in unload requests. |
222 | 230 | l.evictRunner(unload.Backend, model, inference.BackendModeCompletion) |
@@ -413,9 +421,13 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer |
413 | 421 |
|
414 | 422 | // If we've identified a slot, then we're ready to start a runner. |
415 | 423 | if slot >= 0 { |
| 424 | + var runnerConfig *inference.BackendConfiguration |
| 425 | + if rc, ok := l.runnerConfigs[runnerKey{backendName, model, mode}]; ok { |
| 426 | + runnerConfig = &rc |
| 427 | + } |
416 | 428 | // Create the runner. |
417 | 429 | l.log.Infof("Loading %s backend runner with model %s in %s mode", backendName, model, mode) |
418 | | - runner, err := run(l.log, backend, model, mode, slot) |
| 430 | + runner, err := run(l.log, backend, model, mode, slot, runnerConfig) |
419 | 431 | if err != nil { |
420 | 432 | l.log.Warnf("Unable to start %s backend runner with model %s in %s mode: %v", |
421 | 433 | backendName, model, mode, err, |
@@ -492,3 +504,18 @@ func (l *loader) release(runner *runner) { |
492 | 504 | // Signal waiters. |
493 | 505 | l.broadcast() |
494 | 506 | } |
| 507 | + |
| 508 | +func (l *loader) setRunnerConfig(ctx context.Context, backendName, model string, mode inference.BackendMode, runnerConfig inference.BackendConfiguration) error { |
| 509 | + l.lock(ctx) |
| 510 | + defer l.unlock() |
| 511 | + |
| 512 | + runnerId := runnerKey{backendName, model, mode} |
| 513 | + |
| 514 | + if _, ok := l.runners[runnerId]; ok { |
| 515 | + return errRunnerAlreadyActive |
| 516 | + } |
| 517 | + |
| 518 | + l.log.Infof("Configuring %s runner for %s", backendName, model) |
| 519 | + l.runnerConfigs[runnerId] = runnerConfig |
| 520 | + return nil |
| 521 | +} |
0 commit comments