@@ -88,7 +88,8 @@ type monitorT struct {
8888 policiesIndex string
8989 limit * rate.Limiter
9090
91- startCh chan struct {}
91+ startCh chan struct {}
92+ dispatchCh chan struct {}
9293}
9394
9495// NewMonitor creates the policy monitor for subscribing agents.
@@ -135,14 +136,17 @@ func (m *monitorT) Run(ctx context.Context) error {
135136
136137 close (m .startCh )
137138
138- var iCtx context.Context
139+ // use a cancellable context so we can stop dispatching changes if a new hit is received.
140+ // the cancel func is manually called before return, or after policies have been dispatched.
141+ iCtx , iCancel := context .WithCancel (ctx )
139142 var trans * apm.Transaction
140143LOOP:
141144 for {
142145 m .log .Trace ().Msg ("policy monitor loop start" )
143- iCtx = ctx
144146 select {
145147 case <- m .kickCh :
148+ cancelOnce (iCtx , iCancel )
149+ iCtx , iCancel = context .WithCancel (ctx )
146150 m .log .Trace ().Msg ("policy monitor kicked" )
147151 if m .bulker .HasTracer () {
148152 trans = m .bulker .StartTransaction ("initial policies" , "policy_monitor" )
@@ -151,20 +155,31 @@ LOOP:
151155
152156 if err := m .loadPolicies (iCtx ); err != nil {
153157 endTrans (trans )
158+ cancelOnce (iCtx , iCancel )
154159 return err
155160 }
156- m .dispatchPending (iCtx )
157- endTrans (trans )
161+ go func (ctx context.Context , cancel context.CancelFunc , trans * apm.Transaction ) {
162+ m .dispatchPending (ctx )
163+ endTrans (trans )
164+ cancelOnce (ctx , cancel )
165+ }(iCtx , iCancel , trans )
158166 case <- m .deployCh :
167+ cancelOnce (iCtx , iCancel )
168+ iCtx , iCancel = context .WithCancel (ctx )
159169 m .log .Trace ().Msg ("policy monitor deploy ch" )
160170 if m .bulker .HasTracer () {
161171 trans = m .bulker .StartTransaction ("forced policies" , "policy_monitor" )
162172 iCtx = apm .ContextWithTransaction (ctx , trans )
163173 }
164174
165- m .dispatchPending (iCtx )
166- endTrans (trans )
175+ go func (ctx context.Context , cancel context.CancelFunc , trans * apm.Transaction ) {
176+ m .dispatchPending (ctx )
177+ endTrans (trans )
178+ cancelOnce (ctx , cancel )
179+ }(iCtx , iCancel , trans )
167180 case hits := <- s .Output (): // TODO would be nice to attach transaction IDs to hits, but would likely need a bigger refactor.
181+ cancelOnce (iCtx , iCancel )
182+ iCtx , iCancel = context .WithCancel (ctx )
168183 m .log .Trace ().Int ("hits" , len (hits )).Msg ("policy monitor hits from sub" )
169184 if m .bulker .HasTracer () {
170185 trans = m .bulker .StartTransaction ("output policies" , "policy_monitor" )
@@ -173,18 +188,33 @@ LOOP:
173188
174189 if err := m .processHits (iCtx , hits ); err != nil {
175190 endTrans (trans )
191+ cancelOnce (iCtx , iCancel )
176192 return err
177193 }
178- m .dispatchPending (iCtx )
179- endTrans (trans )
194+ go func (ctx context.Context , cancel context.CancelFunc , trans * apm.Transaction ) {
195+ m .dispatchPending (ctx )
196+ endTrans (trans )
197+ cancelOnce (ctx , cancel )
198+ }(iCtx , iCancel , trans )
180199 case <- ctx .Done ():
181200 break LOOP
182201 }
183202 }
184203
204+ iCancel ()
185205 return nil
186206}
187207
// cancelOnce releases the context associated with cancel.
//
// context.CancelFunc is documented as idempotent and safe for concurrent use:
// after the first call, subsequent calls do nothing. Probing ctx.Done() before
// cancelling is therefore unnecessary, and as a check-then-act it never
// provided a real "only once" guarantee anyway, so cancel is invoked directly.
//
// The ctx parameter is kept so the signature stays the same for existing call
// sites; it is intentionally not inspected.
// A nil cancel is ignored.
func cancelOnce(ctx context.Context, cancel context.CancelFunc) {
	if cancel == nil {
		return
	}
	cancel()
}
217+
188218func unmarshalHits (hits []es.HitT ) ([]model.Policy , error ) {
189219 policies := make ([]model.Policy , len (hits ))
190220 for i , hit := range hits {
@@ -224,6 +254,14 @@ func (m *monitorT) waitStart(ctx context.Context) error {
224254// dispatchPending will dispatch all pending policy changes to the subscriptions in the queue.
225255// dispatches are rate limited by the monitor's limiter.
226256func (m * monitorT ) dispatchPending (ctx context.Context ) {
257+ // dispatchCh is used in tests to be able to control when a dispatch execution proceeds
258+ if m .dispatchCh != nil {
259+ select {
260+ case <- m .dispatchCh :
261+ case <- ctx .Done ():
262+ return
263+ }
264+ }
227265 span , ctx := apm .StartSpan (ctx , "dispatch pending" , "dispatch" )
228266 defer span .End ()
229267 m .mut .Lock ()
@@ -243,7 +281,10 @@ func (m *monitorT) dispatchPending(ctx context.Context) {
243281 // If too many (checkin) responses are written concurrently memory usage may explode due to allocating gzip writers.
244282 err := m .limit .Wait (ctx )
245283 if err != nil {
246- m .log .Warn ().Err (err ).Msg ("Policy limit error" )
284+ m .pendingQ .pushFront (s ) // context cancelled before sub is handled, put it back
285+ if ! errors .Is (err , context .Canceled ) {
286+ m .log .Warn ().Err (err ).Msg ("Policy limit error" )
287+ }
247288 return
248289 }
249290 // Lookup the latest policy for this subscription
@@ -257,6 +298,7 @@ func (m *monitorT) dispatchPending(ctx context.Context) {
257298
258299 select {
259300 case <- ctx .Done ():
301+ m .pendingQ .pushFront (s ) // context cancelled before sub is handled, put it back
260302 m .log .Debug ().Err (ctx .Err ()).Msg ("context termination detected in policy dispatch" )
261303 return
262304 case s .ch <- & policy .pp :
0 commit comments