55 "crypto/rand"
66 "encoding/hex"
77 "encoding/json"
8- "errors"
98 "fmt"
109 "log/slog"
1110 "os"
@@ -22,8 +21,6 @@ import (
2221 "google.golang.org/grpc/credentials/insecure"
2322)
2423
25- var ErrMaxWorkersReached = errors .New ("max workers reached" )
26-
2724// ManagedWorker represents a duckdb-service worker process.
2825type ManagedWorker struct {
2926 ID int
@@ -55,6 +52,8 @@ type FlightWorkerPool struct {
5552 sessionCounter SessionCounter // set after SessionManager is created
5653 maxWorkers int // 0 = unlimited
5754 shuttingDown bool
55+ workerSem chan struct {} // buffered to maxWorkers; nil when unlimited
56+ shutdownCh chan struct {} // closed by ShutdownAll to unblock queued waiters
5857}
5958
6059// NewFlightWorkerPool creates a new worker pool.
@@ -66,6 +65,10 @@ func NewFlightWorkerPool(socketDir, configPath string, maxWorkers int) *FlightWo
6665 configPath : configPath ,
6766 binaryPath : binaryPath ,
6867 maxWorkers : maxWorkers ,
68+ shutdownCh : make (chan struct {}),
69+ }
70+ if maxWorkers > 0 {
71+ pool .workerSem = make (chan struct {}, maxWorkers )
6972 }
7073 observeControlPlaneWorkers (0 )
7174 return pool
@@ -235,30 +238,31 @@ func (p *FlightWorkerPool) SpawnMinWorkers(count int) error {
235238 return p .SpawnAll (count )
236239}
237240
238- // AcquireWorker returns a worker for a new session. It first tries to claim an
239- // idle pre-warmed worker (one with no active sessions). If none are available,
240- // it spawns a new one. The max-workers check is performed atomically under the
241- // write lock to prevent TOCTOU races from concurrent connections.
242- func (p * FlightWorkerPool ) AcquireWorker () (* ManagedWorker , error ) {
241+ // AcquireWorker returns a worker for a new session. When maxWorkers is set,
242+ // callers block in FIFO order on the semaphore until a slot is available,
243+ // the context is cancelled, or the pool shuts down.
244+ // Once a slot is acquired, it first tries to claim an idle pre-warmed worker
245+ // (one with no active sessions). If none are available, it spawns a new one.
246+ func (p * FlightWorkerPool ) AcquireWorker (ctx context.Context ) (* ManagedWorker , error ) {
247+ // Block until a semaphore slot is available (FIFO via Go's sudog queue).
248+ if p .workerSem != nil {
249+ select {
250+ case p .workerSem <- struct {}{}:
251+ // Got a slot
252+ case <- ctx .Done ():
253+ return nil , fmt .Errorf ("timed out waiting for available worker (max_workers=%d): %w" , p .maxWorkers , ctx .Err ())
254+ case <- p .shutdownCh :
255+ return nil , fmt .Errorf ("pool is shutting down" )
256+ }
257+ }
258+
243259 p .mu .Lock ()
244260 if p .shuttingDown {
245261 p .mu .Unlock ()
262+ p .releaseWorkerSem ()
246263 return nil , fmt .Errorf ("pool is shutting down" )
247264 }
248265
249- // Check max-workers cap atomically under the write lock
250- if p .maxWorkers > 0 && len (p .workers ) >= p .maxWorkers {
251- // Even at the cap, we may have idle pre-warmed workers to reuse.
252- // Only fail if all existing workers are busy.
253- idle := p .findIdleWorkerLocked ()
254- if idle != nil {
255- p .mu .Unlock ()
256- return idle , nil
257- }
258- p .mu .Unlock ()
259- return nil , fmt .Errorf ("%w (%d)" , ErrMaxWorkersReached , p .maxWorkers )
260- }
261-
262266 // Try to claim an idle pre-warmed worker before spawning a new one
263267 idle := p .findIdleWorkerLocked ()
264268 if idle != nil {
@@ -271,16 +275,28 @@ func (p *FlightWorkerPool) AcquireWorker() (*ManagedWorker, error) {
271275 p .mu .Unlock ()
272276
273277 if err := p .SpawnWorker (id ); err != nil {
278+ p .releaseWorkerSem ()
274279 return nil , err
275280 }
276281
277282 w , ok := p .Worker (id )
278283 if ! ok {
284+ p .releaseWorkerSem ()
279285 return nil , fmt .Errorf ("worker %d not found after spawn" , id )
280286 }
281287 return w , nil
282288}
283289
290+ // releaseWorkerSem drains one token from the semaphore (non-blocking).
291+ func (p * FlightWorkerPool ) releaseWorkerSem () {
292+ if p .workerSem != nil {
293+ select {
294+ case <- p .workerSem :
295+ default :
296+ }
297+ }
298+ }
299+
284300// findIdleWorkerLocked returns a live worker with no active sessions, or nil.
285301// Caller must hold p.mu (read or write lock).
286302func (p * FlightWorkerPool ) findIdleWorkerLocked () * ManagedWorker {
@@ -312,18 +328,23 @@ func (p *FlightWorkerPool) RetireWorker(id int) {
312328 p .mu .Unlock ()
313329 observeControlPlaneWorkers (workerCount )
314330
331+ // Release semaphore slot so a queued waiter can proceed.
332+ p .releaseWorkerSem ()
333+
315334 // Run the actual process cleanup asynchronously so DestroySession
316335 // doesn't block the connection handler goroutine for up to 3s+.
317336 go retireWorkerProcess (w )
318337}
319338
320339// RetireWorkerIfNoSessions retires a worker only if it has no active sessions.
321340// Used to clean up on session creation failure without retiring pre-warmed workers.
322- func (p * FlightWorkerPool ) RetireWorkerIfNoSessions (id int ) {
341+ // Returns true if the worker was retired (and its semaphore slot released).
342+ func (p * FlightWorkerPool ) RetireWorkerIfNoSessions (id int ) bool {
323343 if p .sessionCounter != nil && p .sessionCounter .SessionCountForWorker (id ) > 0 {
324- return
344+ return false
325345 }
326346 p .RetireWorker (id )
347+ return true
327348}
328349
329350// retireWorkerProcess handles the actual process shutdown and socket cleanup.
@@ -384,6 +405,9 @@ func (p *FlightWorkerPool) ShutdownAll() {
384405 }
385406 p .mu .Unlock ()
386407
408+ // Unblock all goroutines waiting in AcquireWorker's semaphore select.
409+ close (p .shutdownCh )
410+
387411 for _ , w := range workers {
388412 if w .cmd .Process != nil {
389413 slog .Info ("Shutting down worker." , "id" , w .ID , "pid" , w .cmd .Process .Pid )
@@ -483,6 +507,7 @@ func (p *FlightWorkerPool) HealthCheckLoop(ctx context.Context, interval time.Du
483507 _ = w .client .Close ()
484508 }
485509 _ = os .Remove (w .socketPath )
510+ p .releaseWorkerSem ()
486511 default :
487512 // Worker is alive, do a health check.
488513 // Recover nil-pointer panics: w.client.Close() (from a
@@ -535,6 +560,7 @@ func (p *FlightWorkerPool) HealthCheckLoop(ctx context.Context, interval time.Du
535560 _ = w .client .Close ()
536561 }
537562 _ = os .Remove (w .socketPath )
563+ p .releaseWorkerSem ()
538564 }
539565 }
540566 } else {
0 commit comments