@@ -58,14 +58,6 @@ type runnerInfo struct {
5858 modelRef string
5959}
6060
61- // memory is used to keep track of runner memory allocations, and available memory size.
62- type memory struct {
63- // ram is the system memory part of the allocation.
64- ram uint64
65- // vram is the video memory part of the allocation.
66- vram uint64
67- }
68-
6961// loader manages the loading and unloading of backend runners. It regulates
7062// active backends in a manner that avoids exhausting system resources. Loaders
7163// assume that all of their backends have been installed, so no load requests
@@ -81,7 +73,7 @@ type loader struct {
8173 // runnerIdleTimeout is the loader-specific default runner idle timeout.
8274 runnerIdleTimeout time.Duration
8375 // totalMemory is the total system memory allocated to the loader.
84- totalMemory memory
76+ totalMemory inference. RequiredMemory
8577 // idleCheck is used to signal the run loop when timestamps have updated.
8678 idleCheck chan struct {}
8779 // guard is a semaphore controlling access to all subsequent fields. It is
@@ -92,7 +84,7 @@ type loader struct {
9284 // loadsEnabled signals that loads are currently enabled.
9385 loadsEnabled bool
9486 // availableMemory is the available portion of the loader's total memory.
95- availableMemory memory
87+ availableMemory inference. RequiredMemory
9688 // waiters is the set of signal channels associated with waiting loaders. We
9789 // use a set of signaling channels (instead of a sync.Cond) to enable
9890 // polling. Each signaling channel should be buffered (with size 1).
@@ -105,16 +97,14 @@ type loader struct {
10597 // references maps slot indices to reference counts.
10698 references []uint
10799 // allocations maps slot indices to memory allocation sizes.
108- allocations []memory
100+ allocations []inference. RequiredMemory
109101 // timestamps maps slot indices to last usage times. Values in this slice
110102 // are only valid if the corresponding reference count is zero.
111103 timestamps []time.Time
112104 // runnerConfigs maps runner keys to runner configurations.
113105 runnerConfigs map [runnerKey ]inference.BackendConfiguration
114106 // openAIRecorder is used to record OpenAI API inference requests and responses.
115107 openAIRecorder * metrics.OpenAIRecorder
116- // gpuInfo is used to retrieve information about GPU(s) that are available to the loader.
117- gpuInfo * gpuinfo.GPUInfo
118108}
119109
120110// newLoader creates a new loader.
@@ -148,24 +138,28 @@ func newLoader(
148138 // TODO(p1-0tr): improve error handling
149139 vramSize , err := gpuInfo .GetVRAMSize ()
150140 if err != nil {
141+ vramSize = 1 // sentinel: load treats a total of 1 as "VRAM size unknown"
151142 log .Warnf ("Could not read VRAM size: %s" , err )
152143 } else {
153- log .Infof ("Running on system with %dMB VRAM" , vramSize / 1024.0 / 1024.0 )
144+ log .Infof ("Running on system with %dMB VRAM" , vramSize / 1024 / 1024 )
154145 }
146+ ramSize := uint64 (1 )
155147 hostInfo , err := sysinfo .Host ()
156148 if err != nil {
157149 log .Warnf ("Could not read host info: %s" , err )
158- }
159- ramSize , err := hostInfo .Memory ()
160- if err != nil {
161- log .Warnf ("Could not read host RAM size: %s" , err )
162150 } else {
163- log .Infof ("Running on system with %dMB RAM" , ramSize .Total / 1024.0 / 1024.0 )
151+ ram , err := hostInfo .Memory ()
152+ if err != nil {
153+ log .Warnf ("Could not read host RAM size: %s" , err )
154+ } else {
155+ ramSize = ram .Total
156+ log .Infof ("Running on system with %dMB RAM" , ramSize / 1024 / 1024 )
157+ }
164158 }
165159
166- totalMemory := memory {
167- ram : ramSize . Total ,
168- vram : vramSize ,
160+ totalMemory := inference. RequiredMemory {
161+ RAM : ramSize ,
162+ VRAM : vramSize ,
169163 }
170164
171165 // Create the loader.
@@ -182,7 +176,7 @@ func newLoader(
182176 runners : make (map [runnerKey ]runnerInfo , nSlots ),
183177 slots : make ([]* runner , nSlots ),
184178 references : make ([]uint , nSlots ),
185- allocations : make ([]memory , nSlots ),
179+ allocations : make ([]inference. RequiredMemory , nSlots ),
186180 timestamps : make ([]time.Time , nSlots ),
187181 runnerConfigs : make (map [runnerKey ]inference.BackendConfiguration ),
188182 openAIRecorder : openAIRecorder ,
@@ -239,9 +233,9 @@ func (l *loader) evict(idleOnly bool) int {
239233 )
240234 l .slots [runnerInfo .slot ].terminate ()
241235 l .slots [runnerInfo .slot ] = nil
242- l .availableMemory .ram += l .allocations [runnerInfo .slot ].ram
243- l .availableMemory .vram += l .allocations [runnerInfo .slot ].vram
244- l .allocations [runnerInfo .slot ] = memory { 0 , 0 }
236+ l .availableMemory .RAM += l .allocations [runnerInfo .slot ].RAM
237+ l .availableMemory .VRAM += l .allocations [runnerInfo .slot ].VRAM
238+ l .allocations [runnerInfo .slot ] = inference. RequiredMemory { RAM : 0 , VRAM : 0 }
245239 l .timestamps [runnerInfo .slot ] = time.Time {}
246240 delete (l .runners , r )
247241 }
@@ -261,9 +255,9 @@ func (l *loader) evictRunner(backend, model string, mode inference.BackendMode)
261255 )
262256 l .slots [runnerInfo .slot ].terminate ()
263257 l .slots [runnerInfo .slot ] = nil
264- l .availableMemory .ram += l .allocations [runnerInfo .slot ].ram
265- l .availableMemory .vram += l .allocations [runnerInfo .slot ].vram
266- l .allocations [runnerInfo .slot ] = memory { 0 , 0 }
258+ l .availableMemory .RAM += l .allocations [runnerInfo .slot ].RAM
259+ l .availableMemory .VRAM += l .allocations [runnerInfo .slot ].VRAM
260+ l .allocations [runnerInfo .slot ] = inference. RequiredMemory { RAM : 0 , VRAM : 0 }
267261 l .timestamps [runnerInfo .slot ] = time.Time {}
268262 delete (l .runners , r )
269263 }
@@ -429,8 +423,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
429423 if err != nil {
430424 return nil , err
431425 }
432- l .log .Infof ("Loading %s, which will require %dMB RAM and %dMB VRAM" , modelID , memory .RAM / 1024.0 / 1024.0 , memory .VRAM / 1024.0 / 1024.0 )
433- if memory .RAM > l .totalMemory .ram || memory .VRAM > l .totalMemory .vram {
426+ l .log .Infof ("Loading %s, which will require %dMB RAM and %dMB VRAM" , modelID , memory .RAM / 1024 / 1024 , memory .VRAM / 1024 / 1024 )
427+ if l .totalMemory .RAM == 1 {
428+ l .log .Warnf ("RAM size unknown. Assume model will fit, but only one." )
429+ memory .RAM = 1
430+ }
431+ if l .totalMemory .VRAM == 1 {
432+ l .log .Warnf ("VRAM size unknown. Assume model will fit, but only one." )
433+ memory .VRAM = 1
434+ }
435+ if memory .RAM > l .totalMemory .RAM || memory .VRAM > l .totalMemory .VRAM {
434436 return nil , errModelTooBig
435437 }
436438
@@ -477,12 +479,12 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
477479
478480 // If there's not sufficient memory or all slots are full, then try
479481 // evicting unused runners.
480- if memory .RAM > l .availableMemory .ram || memory .VRAM > l .availableMemory .vram || len (l .runners ) == len (l .slots ) {
482+ if memory .RAM > l .availableMemory .RAM || memory .VRAM > l .availableMemory .VRAM || len (l .runners ) == len (l .slots ) {
481483 l .evict (false )
482484 }
483485
484486 // If there's sufficient memory and a free slot, then find the slot.
485- if memory .RAM <= l .availableMemory .ram && memory .VRAM <= l .availableMemory .vram && len (l .runners ) < len (l .slots ) {
487+ if memory .RAM <= l .availableMemory .RAM && memory .VRAM <= l .availableMemory .VRAM && len (l .runners ) < len (l .slots ) {
486488 for s , runner := range l .slots {
487489 if runner == nil {
488490 slot = s
@@ -522,13 +524,13 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
522524 }
523525
524526 // Perform registration and return the runner.
525- l .availableMemory .ram -= memory .RAM
526- l .availableMemory .vram -= memory .VRAM
527+ l .availableMemory .RAM -= memory .RAM
528+ l .availableMemory .VRAM -= memory .VRAM
527529 l .runners [runnerKey {backendName , modelID , mode }] = runnerInfo {slot , modelRef }
528530 l .slots [slot ] = runner
529531 l .references [slot ] = 1
530- l .allocations [slot ].ram = memory .RAM
531- l .allocations [slot ].vram = memory .VRAM
532+ l .allocations [slot ].RAM = memory .RAM
533+ l .allocations [slot ].VRAM = memory .VRAM
532534 return runner , nil
533535 }
534536
0 commit comments