Skip to content

Commit b6d86e5

Browse files
author
Piotr Stankiewicz
committed
inference: Fallback behaviour if reading RAM/VRAM size fails
Signed-off-by: Piotr Stankiewicz <[email protected]>
1 parent cd5f08d commit b6d86e5

File tree

1 file changed

+39
-37
lines changed

1 file changed

+39
-37
lines changed

pkg/inference/scheduling/loader.go

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,6 @@ type runnerInfo struct {
5858
modelRef string
5959
}
6060

61-
// memory is used to keep track of runner memory allocations, and available memory size.
62-
type memory struct {
63-
// ram is the system memory part of the allocation.
64-
ram uint64
65-
// vram is the video memory part of the allocation.
66-
vram uint64
67-
}
68-
6961
// loader manages the loading and unloading of backend runners. It regulates
7062
// active backends in a manner that avoids exhausting system resources. Loaders
7163
// assume that all of their backends have been installed, so no load requests
@@ -81,7 +73,7 @@ type loader struct {
8173
// runnerIdleTimeout is the loader-specific default runner idle timeout.
8274
runnerIdleTimeout time.Duration
8375
// totalMemory is the total system memory allocated to the loader.
84-
totalMemory memory
76+
totalMemory inference.RequiredMemory
8577
// idleCheck is used to signal the run loop when timestamps have updated.
8678
idleCheck chan struct{}
8779
// guard is a semaphore controlling access to all subsequent fields. It is
@@ -92,7 +84,7 @@ type loader struct {
9284
// loadsEnabled signals that loads are currently enabled.
9385
loadsEnabled bool
9486
// availableMemory is the available portion of the loader's total memory.
95-
availableMemory memory
87+
availableMemory inference.RequiredMemory
9688
// waiters is the set of signal channels associated with waiting loaders. We
9789
// use a set of signaling channels (instead of a sync.Cond) to enable
9890
// polling. Each signaling channel should be buffered (with size 1).
@@ -105,16 +97,14 @@ type loader struct {
10597
// references maps slot indices to reference counts.
10698
references []uint
10799
// allocations maps slot indices to memory allocation sizes.
108-
allocations []memory
100+
allocations []inference.RequiredMemory
109101
// timestamps maps slot indices to last usage times. Values in this slice
110102
// are only valid if the corresponding reference count is zero.
111103
timestamps []time.Time
112104
// runnerConfigs maps model names to runner configurations
113105
runnerConfigs map[runnerKey]inference.BackendConfiguration
114106
// openAIRecorder is used to record OpenAI API inference requests and responses.
115107
openAIRecorder *metrics.OpenAIRecorder
116-
// gpuInfo is used to retrieve information about GPU(s) that are available to the loader.
117-
gpuInfo *gpuinfo.GPUInfo
118108
}
119109

120110
// newLoader creates a new loader.
@@ -148,24 +138,28 @@ func newLoader(
148138
// TODO(p1-0tr): improve error handling
149139
vramSize, err := gpuInfo.GetVRAMSize()
150140
if err != nil {
141+
vramSize = 1
151142
log.Warnf("Could not read VRAM size: %s", err)
152143
} else {
153-
log.Infof("Running on system with %dMB VRAM", vramSize/1024.0/1024.0)
144+
log.Infof("Running on system with %dMB VRAM", vramSize/1024/1024)
154145
}
146+
ramSize := uint64(1)
155147
hostInfo, err := sysinfo.Host()
156148
if err != nil {
157149
log.Warnf("Could not read host info: %s", err)
158-
}
159-
ramSize, err := hostInfo.Memory()
160-
if err != nil {
161-
log.Warnf("Could not read host RAM size: %s", err)
162150
} else {
163-
log.Infof("Running on system with %dMB RAM", ramSize.Total/1024.0/1024.0)
151+
ram, err := hostInfo.Memory()
152+
if err != nil {
153+
log.Warnf("Could not read host RAM size: %s", err)
154+
} else {
155+
ramSize = ram.Total
156+
log.Infof("Running on system with %dMB RAM", ramSize/1024/1024)
157+
}
164158
}
165159

166-
totalMemory := memory{
167-
ram: ramSize.Total,
168-
vram: vramSize,
160+
totalMemory := inference.RequiredMemory{
161+
RAM: ramSize,
162+
VRAM: vramSize,
169163
}
170164

171165
// Create the loader.
@@ -182,7 +176,7 @@ func newLoader(
182176
runners: make(map[runnerKey]runnerInfo, nSlots),
183177
slots: make([]*runner, nSlots),
184178
references: make([]uint, nSlots),
185-
allocations: make([]memory, nSlots),
179+
allocations: make([]inference.RequiredMemory, nSlots),
186180
timestamps: make([]time.Time, nSlots),
187181
runnerConfigs: make(map[runnerKey]inference.BackendConfiguration),
188182
openAIRecorder: openAIRecorder,
@@ -239,9 +233,9 @@ func (l *loader) evict(idleOnly bool) int {
239233
)
240234
l.slots[runnerInfo.slot].terminate()
241235
l.slots[runnerInfo.slot] = nil
242-
l.availableMemory.ram += l.allocations[runnerInfo.slot].ram
243-
l.availableMemory.vram += l.allocations[runnerInfo.slot].vram
244-
l.allocations[runnerInfo.slot] = memory{0, 0}
236+
l.availableMemory.RAM += l.allocations[runnerInfo.slot].RAM
237+
l.availableMemory.VRAM += l.allocations[runnerInfo.slot].VRAM
238+
l.allocations[runnerInfo.slot] = inference.RequiredMemory{RAM: 0, VRAM: 0}
245239
l.timestamps[runnerInfo.slot] = time.Time{}
246240
delete(l.runners, r)
247241
}
@@ -261,9 +255,9 @@ func (l *loader) evictRunner(backend, model string, mode inference.BackendMode)
261255
)
262256
l.slots[runnerInfo.slot].terminate()
263257
l.slots[runnerInfo.slot] = nil
264-
l.availableMemory.ram += l.allocations[runnerInfo.slot].ram
265-
l.availableMemory.vram += l.allocations[runnerInfo.slot].vram
266-
l.allocations[runnerInfo.slot] = memory{0, 0}
258+
l.availableMemory.RAM += l.allocations[runnerInfo.slot].RAM
259+
l.availableMemory.VRAM += l.allocations[runnerInfo.slot].VRAM
260+
l.allocations[runnerInfo.slot] = inference.RequiredMemory{RAM: 0, VRAM: 0}
267261
l.timestamps[runnerInfo.slot] = time.Time{}
268262
delete(l.runners, r)
269263
}
@@ -429,8 +423,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
429423
if err != nil {
430424
return nil, err
431425
}
432-
l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024.0/1024.0, memory.VRAM/1024.0/1024.0)
433-
if memory.RAM > l.totalMemory.ram || memory.VRAM > l.totalMemory.vram {
426+
l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
427+
if l.totalMemory.RAM == 1 {
428+
l.log.Warnf("RAM size unknown. Assume model will fit, but only one.")
429+
memory.RAM = 1
430+
}
431+
if l.totalMemory.VRAM == 1 {
432+
l.log.Warnf("VRAM size unknown. Assume model will fit, but only one.")
433+
memory.VRAM = 1
434+
}
435+
if memory.RAM > l.totalMemory.RAM || memory.VRAM > l.totalMemory.VRAM {
434436
return nil, errModelTooBig
435437
}
436438

@@ -477,12 +479,12 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
477479

478480
// If there's not sufficient memory or all slots are full, then try
479481
// evicting unused runners.
480-
if memory.RAM > l.availableMemory.ram || memory.VRAM > l.availableMemory.vram || len(l.runners) == len(l.slots) {
482+
if memory.RAM > l.availableMemory.RAM || memory.VRAM > l.availableMemory.VRAM || len(l.runners) == len(l.slots) {
481483
l.evict(false)
482484
}
483485

484486
// If there's sufficient memory and a free slot, then find the slot.
485-
if memory.RAM <= l.availableMemory.ram && memory.VRAM <= l.availableMemory.vram && len(l.runners) < len(l.slots) {
487+
if memory.RAM <= l.availableMemory.RAM && memory.VRAM <= l.availableMemory.VRAM && len(l.runners) < len(l.slots) {
486488
for s, runner := range l.slots {
487489
if runner == nil {
488490
slot = s
@@ -522,13 +524,13 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
522524
}
523525

524526
// Perform registration and return the runner.
525-
l.availableMemory.ram -= memory.RAM
526-
l.availableMemory.vram -= memory.VRAM
527+
l.availableMemory.RAM -= memory.RAM
528+
l.availableMemory.VRAM -= memory.VRAM
527529
l.runners[runnerKey{backendName, modelID, mode}] = runnerInfo{slot, modelRef}
528530
l.slots[slot] = runner
529531
l.references[slot] = 1
530-
l.allocations[slot].ram = memory.RAM
531-
l.allocations[slot].vram = memory.VRAM
532+
l.allocations[slot].RAM = memory.RAM
533+
l.allocations[slot].VRAM = memory.VRAM
532534
return runner, nil
533535
}
534536

0 commit comments

Comments
 (0)