@@ -13,6 +13,7 @@ public sealed class MonitoringBackgroundService : BackgroundService
1313 private readonly ILogger < MonitoringBackgroundService > _logger ;
1414 private readonly SemaphoreSlim _concurrencySemaphore ;
1515 private readonly ConcurrentDictionary < Guid , Timer > _endpointTimers = new ( ) ;
16+ private readonly ConcurrentDictionary < Guid , bool > _probeExecuting = new ( ) ;
1617 private readonly int _maxConcurrentProbes ;
1718
1819 public MonitoringBackgroundService ( IServiceProvider serviceProvider ,
@@ -55,18 +56,67 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken)
5556 }
5657 }
5758
58- // Handle graceful shutdown
59- using ( IServiceScope scope = _serviceProvider . CreateScope ( ) )
59+ // Handle graceful shutdown with cancellation support
60+ try
6061 {
61- IOutageDetectionService outageService = scope . ServiceProvider . GetRequiredService < IOutageDetectionService > ( ) ;
62- await outageService . HandleGracefulShutdownAsync ( "Service stopping" , CancellationToken . None ) ;
62+ using ( IServiceScope scope = _serviceProvider . CreateScope ( ) )
63+ {
64+ IOutageDetectionService outageService = scope . ServiceProvider . GetRequiredService < IOutageDetectionService > ( ) ;
65+
66+ // Create a timeout for graceful shutdown (30 seconds max)
67+ // This allows force shutdown while giving reasonable time for data safety
68+ using var shutdownCts = new CancellationTokenSource ( TimeSpan . FromSeconds ( 30 ) ) ;
69+ using var combinedCts = CancellationTokenSource . CreateLinkedTokenSource ( stoppingToken , shutdownCts . Token ) ;
70+
71+ await outageService . HandleGracefulShutdownAsync ( "Service stopping" , combinedCts . Token ) ;
72+
73+ _logger . LogInformation ( "Graceful shutdown completed successfully" ) ;
74+ }
75+ }
76+ catch ( OperationCanceledException ) when ( stoppingToken . IsCancellationRequested )
77+ {
78+ _logger . LogWarning ( "Graceful shutdown was cancelled - service forced to stop" ) ;
79+ }
80+ catch ( OperationCanceledException )
81+ {
82+ _logger . LogWarning ( "Graceful shutdown timed out (30s) - proceeding with forced shutdown" ) ;
83+ }
84+ catch ( Exception ex )
85+ {
86+ _logger . LogError ( ex , "Error during graceful shutdown - proceeding with forced shutdown" ) ;
6387 }
6488
65- // Clean up timers
66- foreach ( Timer timer in _endpointTimers . Values )
89+ // Clean up timers gracefully with cancellation support
90+ var timerCleanupTasks = new List < Task > ( ) ;
91+ foreach ( var kvp in _endpointTimers . ToList ( ) ) // ToList to avoid modification during enumeration
6792 {
68- timer . Dispose ( ) ;
93+ timerCleanupTasks . Add ( StopTimerGracefullyAsync ( kvp . Value , kvp . Key ) ) ;
94+ }
95+
96+ try
97+ {
98+ // Create timeout for timer cleanup (30 seconds max) while respecting external cancellation
99+ using var timerCleanupCts = new CancellationTokenSource ( TimeSpan . FromSeconds ( 30 ) ) ;
100+ using var combinedCleanupCts = CancellationTokenSource . CreateLinkedTokenSource ( stoppingToken , timerCleanupCts . Token ) ;
101+
102+ // Wait for all timers to shutdown gracefully (with timeout and cancellation)
103+ await Task . WhenAll ( timerCleanupTasks ) . WaitAsync ( combinedCleanupCts . Token ) ;
104+
105+ _logger . LogInformation ( "All timers stopped gracefully" ) ;
69106 }
107+ catch ( OperationCanceledException ) when ( stoppingToken . IsCancellationRequested )
108+ {
109+ _logger . LogWarning ( "Timer cleanup was cancelled - service forced to stop" ) ;
110+ }
111+ catch ( OperationCanceledException )
112+ {
113+ _logger . LogWarning ( "Timer cleanup timed out (30s) - some timers may not have stopped cleanly" ) ;
114+ }
115+ catch ( Exception ex )
116+ {
117+ _logger . LogError ( ex , "Error during timer cleanup" ) ;
118+ }
119+
70120 _endpointTimers . Clear ( ) ;
71121
72122 _logger . LogInformation ( "Monitoring background service stopped" ) ;
@@ -91,7 +141,8 @@ private async Task RefreshEndpointsAsync(CancellationToken cancellationToken)
91141 {
92142 if ( _endpointTimers . TryRemove ( endpointId , out Timer ? timer ) )
93143 {
94- timer . Dispose ( ) ;
144+ await StopTimerGracefullyAsync ( timer , endpointId ) ;
145+ _probeExecuting . TryRemove ( endpointId , out _ ) ; // Clean up execution tracking
95146 _logger . LogInformation ( "Stopped monitoring endpoint: {EndpointId}" , endpointId ) ;
96147 }
97148 }
@@ -103,7 +154,16 @@ private async Task RefreshEndpointsAsync(CancellationToken cancellationToken)
103154
104155 if ( _endpointTimers . TryGetValue ( endpoint . Id , out Timer ? existingTimer ) )
105156 {
106- // Update existing timer if interval changed
157+ // Stop timer to prevent race condition, then restart with new interval
158+ existingTimer . Change ( Timeout . Infinite , Timeout . Infinite ) ;
159+
160+ // Wait briefly if probe is currently executing to avoid immediate restart
161+ if ( _probeExecuting . TryGetValue ( endpoint . Id , out bool isExecuting ) && isExecuting )
162+ {
163+ await Task . Delay ( 100 ) ; // Brief delay to let current execution complete
164+ }
165+
166+ // Restart with new interval
107167 existingTimer . Change ( TimeSpan . Zero , TimeSpan . FromMilliseconds ( intervalMs ) ) ;
108168 }
109169 else
@@ -134,6 +194,9 @@ private async Task ProbeEndpointAsync(Guid endpointId)
134194 return ;
135195 }
136196
197+ // Mark probe as executing to prevent timer race conditions
198+ _probeExecuting . TryAdd ( endpointId , true ) ;
199+
137200 try
138201 {
139202 using IServiceScope scope = _serviceProvider . CreateScope ( ) ;
@@ -171,17 +234,66 @@ private async Task ProbeEndpointAsync(Guid endpointId)
171234 }
172235 finally
173236 {
237+ // Clear execution flag
238+ _probeExecuting . TryRemove ( endpointId , out _ ) ;
174239 _concurrencySemaphore . Release ( ) ;
175240 }
176241 }
177242
/// <summary>
/// Stops a timer without cutting off a probe that is still running: the timer is disabled,
/// the probe tracked in <c>_probeExecuting</c> for <paramref name="endpointId"/> is given up to
/// 30 seconds to finish, and the timer is then disposed.
/// </summary>
/// <param name="timer">The timer to stop. It is disposed before this method completes, whether or not the wait succeeds.</param>
/// <param name="endpointId">Endpoint whose <c>_probeExecuting</c> entry is polled while waiting.</param>
/// <returns>A task that completes once the timer has been disposed. Errors raised while stopping or waiting are logged rather than rethrown.</returns>
private async Task StopTimerGracefullyAsync(Timer timer, Guid endpointId)
{
    const int maxWaitMs = 30000; // 30 seconds max wait
    const int checkIntervalMs = 100;

    try
    {
        // Stop the timer from firing again before waiting on the current execution.
        timer.Change(Timeout.Infinite, Timeout.Infinite);

        // Bound the wait by real elapsed time: Task.Delay can overshoot its interval, so
        // counting iterations would let the effective cap stretch beyond maxWaitMs.
        long deadline = Environment.TickCount64 + maxWaitMs;

        bool stillExecuting = IsProbeExecuting();
        while (stillExecuting && Environment.TickCount64 < deadline)
        {
            await Task.Delay(checkIntervalMs);
            stillExecuting = IsProbeExecuting();
        }

        // Only report a timeout when the last observation still saw the probe running;
        // a probe that finished during the final poll interval is not a timeout.
        if (stillExecuting)
        {
            _logger.LogWarning("Timeout waiting for probe execution to complete for endpoint: {EndpointId}", endpointId);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error during graceful timer shutdown for endpoint: {EndpointId}", endpointId);
    }
    finally
    {
        // Single disposal path for success, timeout and error alike.
        timer.Dispose();
    }

    // True while _probeExecuting holds a 'true' entry for this endpoint.
    bool IsProbeExecuting() =>
        _probeExecuting.TryGetValue(endpointId, out bool isExecuting) && isExecuting;
}
278+
178279 public override void Dispose ( )
179280 {
180281 _concurrencySemaphore ? . Dispose ( ) ;
181282
283+ // Force dispose all remaining timers (should already be cleaned up in ExecuteAsync)
182284 foreach ( Timer timer in _endpointTimers . Values )
183285 {
184- timer . Dispose ( ) ;
286+ try
287+ {
288+ // Stop timer first, then dispose (no graceful wait in synchronous dispose)
289+ timer . Change ( Timeout . Infinite , Timeout . Infinite ) ;
290+ timer . Dispose ( ) ;
291+ }
292+ catch ( Exception ex )
293+ {
294+ // Log but don't throw during disposal
295+ _logger ? . LogWarning ( ex , "Error disposing timer during service disposal" ) ;
296+ }
185297 }
186298
187299 base . Dispose ( ) ;
0 commit comments