@@ -29,16 +29,14 @@ type AISessionPool struct {
2929 sessMap map [string ]* BroadcastSession
3030 inUseSess []* BroadcastSession
3131 suspender * suspender
32- penalty int
3332 mu sync.RWMutex
3433}
3534
36- func NewAISessionPool (selector BroadcastSessionsSelector , suspender * suspender , penalty int ) * AISessionPool {
35+ func NewAISessionPool (selector BroadcastSessionsSelector , suspender * suspender ) * AISessionPool {
3736 return & AISessionPool {
3837 selector : selector ,
3938 sessMap : make (map [string ]* BroadcastSession ),
4039 suspender : suspender ,
41- penalty : penalty ,
4240 mu : sync.RWMutex {},
4341 }
4442}
@@ -103,6 +101,10 @@ func (pool *AISessionPool) Add(sessions []*BroadcastSession) {
103101 pool .mu .Lock ()
104102 defer pool .mu .Unlock ()
105103
104+ // If we try to add new sessions to the pool, the suspender
105+ // should treat this as a refresh
106+ pool .suspender .signalRefresh ()
107+
106108 var uniqueSessions []* BroadcastSession
107109 for _ , sess := range sessions {
108110 if _ , ok := pool .sessMap [sess .Transcoder ()]; ok {
@@ -124,14 +126,10 @@ func (pool *AISessionPool) Remove(sess *BroadcastSession) {
124126 delete (pool .sessMap , sess .Transcoder ())
125127 pool .inUseSess = removeSessionFromList (pool .inUseSess , sess )
126128
129+ // Suspension penalty passed to the suspender; hard-coded magic number for now
130+ penalty := 3
127131 // If this method is called assume that the orch should be suspended
128- // as well. Since AISessionManager re-uses the pools the suspension
129- // penalty needs to consider the current suspender count to set the penalty
130- lastCount , ok := pool .suspender .list [sess .Transcoder ()]
131- penalty := pool .suspender .count + pool .penalty
132- if ok {
133- penalty -= lastCount
134- }
132+ // as well
135133 pool .suspender .suspend (sess .Transcoder (), penalty )
136134}
137135
@@ -158,14 +156,12 @@ type AISessionSelector struct {
158156 // The time until the pools should be refreshed with orchs from discovery
159157 ttl time.Duration
160158 lastRefreshTime time.Time
161- initialPoolSize int
162159
163160 cap core.Capability
164161 modelID string
165162
166163 node * core.LivepeerNode
167164 suspender * suspender
168- penalty int
169165 os drivers.OSSession
170166}
171167
@@ -184,10 +180,8 @@ func NewAISessionSelector(ctx context.Context, cap core.Capability, modelID stri
184180 // The latency score in this context is just the latency of the last completed request for a session
185181 // The "good enough" latency score is set to 0.0 so the selector will always select unknown sessions first
186182 minLS := 0.0
187- // Session pool suspender starts at 0. Suspension is 3 requests if there are errors from the orchestrator
188- penalty := 3
189- warmPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore , warmCaps ), suspender , penalty )
190- coldPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore , coldCaps ), suspender , penalty )
183+ warmPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore , warmCaps ), suspender )
184+ coldPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore , coldCaps ), suspender )
191185 sel := & AISessionSelector {
192186 warmPool : warmPool ,
193187 coldPool : coldPool ,
@@ -196,7 +190,6 @@ func NewAISessionSelector(ctx context.Context, cap core.Capability, modelID stri
196190 modelID : modelID ,
197191 node : node ,
198192 suspender : suspender ,
199- penalty : penalty ,
200193 os : drivers .NodeStorage .NewSession (strconv .Itoa (int (cap )) + "_" + modelID ),
201194 }
202195
@@ -225,26 +218,11 @@ func newAICapabilities(cap core.Capability, modelID string, warm bool, minVersio
225218 return caps
226219}
227220
228- // selectorIsEmpty returns true if no orchestrators are in the warm or cold pools.
229- func (sel * AISessionSelector ) SelectorIsEmpty () bool {
230- return sel .warmPool .Size () == 0 && sel .coldPool .Size () == 0
231- }
232-
233221func (sel * AISessionSelector ) Select (ctx context.Context ) * AISession {
234222 shouldRefreshSelector := func () bool {
235- discoveryPoolSize := int (math .Min (float64 (sel .node .OrchestratorPool .Size ()), float64 (sel .initialPoolSize )))
236-
237- // If the selector is empty, release all orchestrators from suspension and
238- // try refresh.
239- if sel .SelectorIsEmpty () {
240- clog .Infof (ctx , "refreshing sessions, no orchestrators in pools" )
241- for i := 0 ; i < sel .penalty ; i ++ {
242- sel .suspender .signalRefresh ()
243- }
244- }
245-
246223 // Refresh if the # of sessions across warm and cold pools falls below the smaller of the maxRefreshSessionsThreshold and
247224 // 1/2 the total # of orchs that can be queried during discovery
225+ discoveryPoolSize := sel .node .OrchestratorPool .Size ()
248226 if sel .warmPool .Size ()+ sel .coldPool .Size () < int (math .Min (maxRefreshSessionsThreshold , math .Ceil (float64 (discoveryPoolSize )/ 2.0 ))) {
249227 return true
250228 }
@@ -294,10 +272,6 @@ func (sel *AISessionSelector) Remove(sess *AISession) {
294272}
295273
296274func (sel * AISessionSelector ) Refresh (ctx context.Context ) error {
297- // If we try to add new sessions to the pool the suspender
298- // should treat this as a refresh
299- sel .suspender .signalRefresh ()
300-
301275 sessions , err := sel .getSessions (ctx )
302276 if err != nil {
303277 return err
@@ -312,13 +286,6 @@ func (sel *AISessionSelector) Refresh(ctx context.Context) error {
312286 continue
313287 }
314288
315- // We request 100 orchestrators in getSessions above so all Orchestrators are returned with refreshed information
316- // This keeps the suspended Orchestrators out of the pool until the selector is empty or 30 minutes has passed (refresh happens every 10 minutes)
317- if sel .suspender .Suspended (sess .Transcoder ()) > 0 {
318- clog .V (common .DEBUG ).Infof (ctx , "skipping suspended orchestrator=%s" , sess .Transcoder ())
319- continue
320- }
321-
322289 // If the constraint for the modelID are missing skip this session
323290 modelConstraint , ok := constraints .Models [sel .modelID ]
324291 if ! ok {
@@ -334,7 +301,6 @@ func (sel *AISessionSelector) Refresh(ctx context.Context) error {
334301
335302 sel .warmPool .Add (warmSessions )
336303 sel .coldPool .Add (coldSessions )
337- sel .initialPoolSize = len (warmSessions ) + len (coldSessions ) + len (sel .suspender .list )
338304
339305 sel .lastRefreshTime = time .Now ()
340306
@@ -405,8 +371,6 @@ func (c *AISessionManager) Select(ctx context.Context, cap core.Capability, mode
405371 return nil , err
406372 }
407373
408- clog .V (common .DEBUG ).Infof (ctx , "selected orchestrator=%s" , sess .Transcoder ())
409-
410374 return sess , nil
411375}
412376
0 commit comments