@@ -29,7 +29,9 @@ internal class RpcFunctionInvocationDispatcher : IFunctionInvocationDispatcher
29
29
private readonly IEnvironment _environment ;
30
30
private readonly IApplicationLifetime _applicationLifetime ;
31
31
private readonly TimeSpan _shutdownTimeout = TimeSpan . FromSeconds ( 10 ) ;
32
- private readonly TimeSpan thresholdBetweenRestarts = TimeSpan . FromMinutes ( WorkerConstants . WorkerRestartErrorIntervalThresholdInMinutes ) ;
32
+ private readonly TimeSpan _restartWait = TimeSpan . FromSeconds ( 10 ) ;
33
+ private readonly SemaphoreSlim _restartWorkerProcessSLock = new SemaphoreSlim ( 1 , 1 ) ;
34
+ private readonly TimeSpan _thresholdBetweenRestarts = TimeSpan . FromMinutes ( WorkerConstants . WorkerRestartErrorIntervalThresholdInMinutes ) ;
33
35
34
36
private IScriptEventManager _eventManager ;
35
37
private IEnumerable < RpcWorkerConfig > _workerConfigs ;
@@ -48,6 +50,7 @@ internal class RpcFunctionInvocationDispatcher : IFunctionInvocationDispatcher
48
50
private IEnumerable < FunctionMetadata > _functions ;
49
51
private ConcurrentStack < WorkerErrorEvent > _languageWorkerErrors = new ConcurrentStack < WorkerErrorEvent > ( ) ;
50
52
private CancellationTokenSource _processStartCancellationToken = new CancellationTokenSource ( ) ;
53
+ private CancellationTokenSource _disposeToken = new CancellationTokenSource ( ) ;
51
54
private int _debounceMilliSeconds = ( int ) TimeSpan . FromSeconds ( 10 ) . TotalMilliseconds ;
52
55
53
56
public RpcFunctionInvocationDispatcher ( IOptions < ScriptJobHostOptions > scriptHostOptions ,
@@ -310,35 +313,43 @@ internal async Task<IEnumerable<IRpcWorkerChannel>> GetInitializedWorkerChannels
310
313
311
314
public async void WorkerError ( WorkerErrorEvent workerError )
312
315
{
313
- if ( ! _disposing )
316
+ if ( _disposing || _disposed )
314
317
{
315
- if ( string . Equals ( _workerRuntime , workerError . Language ) )
316
- {
317
- _logger . LogDebug ( "Handling WorkerErrorEvent for runtime:{runtime}, workerId:{workerId}. Failed with: {exception}" , workerError . Language , _workerRuntime , workerError . Exception ) ;
318
- AddOrUpdateErrorBucket ( workerError ) ;
319
- await DisposeAndRestartWorkerChannel ( workerError . Language , workerError . WorkerId , workerError . Exception ) ;
320
- }
321
- else
322
- {
323
- _logger . LogDebug ( "Received WorkerErrorEvent for runtime:{runtime}, workerId:{workerId}" , workerError . Language , workerError . WorkerId ) ;
324
- _logger . LogDebug ( "WorkerErrorEvent runtime:{runtime} does not match current runtime:{currentRuntime}. Failed with: {exception}" , workerError . Language , _workerRuntime , workerError . Exception ) ;
325
- }
318
+ return ;
319
+ }
320
+
321
+ if ( string . Equals ( _workerRuntime , workerError . Language ) )
322
+ {
323
+ _logger . LogDebug ( "Handling WorkerErrorEvent for runtime:{runtime}, workerId:{workerId}. Failed with: {exception}" , workerError . Language , _workerRuntime , workerError . Exception ) ;
324
+ AddOrUpdateErrorBucket ( workerError ) ;
325
+ await DisposeAndRestartWorkerChannel ( workerError . Language , workerError . WorkerId , workerError . Exception ) ;
326
+ }
327
+ else
328
+ {
329
+ _logger . LogDebug ( "Received WorkerErrorEvent for runtime:{runtime}, workerId:{workerId}" , workerError . Language , workerError . WorkerId ) ;
330
+ _logger . LogDebug ( "WorkerErrorEvent runtime:{runtime} does not match current runtime:{currentRuntime}. Failed with: {exception}" , workerError . Language , _workerRuntime , workerError . Exception ) ;
326
331
}
327
332
}
328
333
329
334
public async void WorkerRestart ( WorkerRestartEvent workerRestart )
330
335
{
331
- if ( ! _disposing )
336
+ if ( _disposing || _disposed )
332
337
{
333
- _logger . LogDebug ( "Handling WorkerRestartEvent for runtime:{runtime}, workerId:{workerId}" , workerRestart . Language , workerRestart . WorkerId ) ;
334
- await DisposeAndRestartWorkerChannel ( workerRestart . Language , workerRestart . WorkerId ) ;
338
+ return ;
335
339
}
340
+
341
+ _logger . LogDebug ( "Handling WorkerRestartEvent for runtime:{runtime}, workerId:{workerId}" , workerRestart . Language , workerRestart . WorkerId ) ;
342
+ await DisposeAndRestartWorkerChannel ( workerRestart . Language , workerRestart . WorkerId ) ;
336
343
}
337
344
338
345
private async Task DisposeAndRestartWorkerChannel ( string runtime , string workerId , Exception workerException = null )
339
346
{
340
- _logger . LogDebug ( "Attempting to dispose webhost or jobhost channel for workerId: {channelId}, runtime:{language}" , workerId , runtime ) ;
347
+ if ( _disposing || _disposed )
348
+ {
349
+ return ;
350
+ }
341
351
352
+ _logger . LogDebug ( "Attempting to dispose webhost or jobhost channel for workerId: '{channelId}', runtime: '{language}'" , workerId , runtime ) ;
342
353
bool isWebHostChannelDisposed = await _webHostLanguageWorkerChannelManager . ShutdownChannelIfExistsAsync ( runtime , workerId , workerException ) ;
343
354
bool isJobHostChannelDisposed = false ;
344
355
if ( ! isWebHostChannelDisposed )
@@ -348,7 +359,7 @@ private async Task DisposeAndRestartWorkerChannel(string runtime, string workerI
348
359
349
360
if ( ! isWebHostChannelDisposed && ! isJobHostChannelDisposed )
350
361
{
351
- _logger . LogDebug ( "Did not find WebHost or JobHost channel to dispose for workerId: {channelId}, runtime:{language}" , workerId , runtime ) ;
362
+ _logger . LogDebug ( "Did not find WebHost or JobHost channel to dispose for workerId: ' {channelId}' , runtime: ' {language}' " , workerId , runtime ) ;
352
363
}
353
364
354
365
if ( ShouldRestartWorkerChannel ( runtime , isWebHostChannelDisposed , isJobHostChannelDisposed ) )
@@ -360,13 +371,13 @@ private async Task DisposeAndRestartWorkerChannel(string runtime, string workerI
360
371
_logger . LogDebug ( "No initialized worker channels for runtime '{runtime}'. Delaying future invocations" , runtime ) ;
361
372
}
362
373
// Restart worker channel
363
- _logger . LogDebug ( "Restarting worker channel for runtime:{runtime}" , runtime ) ;
364
- await RestartWorkerChannel ( runtime , workerId ) ;
374
+ _logger . LogDebug ( "Restarting worker channel for runtime: ' {runtime}' " , runtime ) ;
375
+ await RestartWorkerChannel ( runtime ) ;
365
376
// State is set back to "Initialized" when worker channel is up again
366
377
}
367
378
else
368
379
{
369
- _logger . LogDebug ( "Skipping worker channel restart for errored worker runtime:{runtime}, current runtime:{currentRuntime}, isWebHostChannel:{isWebHostChannel}, isJobHostChannel:{isJobHostChannel}" ,
380
+ _logger . LogDebug ( "Skipping worker channel restart for errored worker runtime: ' {runtime}' , current runtime: ' {currentRuntime}' , isWebHostChannel: ' {isWebHostChannel}' , isJobHostChannel: ' {isJobHostChannel}' " ,
370
381
runtime , _workerRuntime , isWebHostChannelDisposed , isJobHostChannelDisposed ) ;
371
382
}
372
383
}
@@ -376,11 +387,33 @@ internal bool ShouldRestartWorkerChannel(string runtime, bool isWebHostChannel,
376
387
return string . Equals ( _workerRuntime , runtime , StringComparison . InvariantCultureIgnoreCase ) && ( isWebHostChannel || isJobHostChannel ) ;
377
388
}
378
389
379
- private async Task RestartWorkerChannel ( string runtime , string workerId )
390
+ private async Task RestartWorkerChannel ( string runtime )
380
391
{
392
+ if ( _disposing || _disposed )
393
+ {
394
+ return ;
395
+ }
396
+
381
397
if ( _languageWorkerErrors . Count < ErrorEventsThreshold )
382
398
{
383
- await InitializeJobhostLanguageWorkerChannelAsync ( _languageWorkerErrors . Count ) ;
399
+ try
400
+ {
401
+ // Issue only one restart at a time.
402
+ await _restartWorkerProcessSLock . WaitAsync ( ) ;
403
+ await InitializeJobhostLanguageWorkerChannelAsync ( _languageWorkerErrors . Count ) ;
404
+ }
405
+ finally
406
+ {
407
+ // Wait before releasing the lock to give time for the process to startup and initialize.
408
+ try
409
+ {
410
+ await Task . Delay ( _restartWait , _disposeToken . Token ) ;
411
+ _restartWorkerProcessSLock . Release ( ) ;
412
+ }
413
+ catch ( TaskCanceledException )
414
+ {
415
+ }
416
+ }
384
417
}
385
418
else if ( _jobHostLanguageWorkerChannelManager . GetChannels ( ) . Count ( ) == 0 )
386
419
{
@@ -393,12 +426,12 @@ private void AddOrUpdateErrorBucket(WorkerErrorEvent currentErrorEvent)
393
426
{
394
427
if ( _languageWorkerErrors . TryPeek ( out WorkerErrorEvent top ) )
395
428
{
396
- if ( ( currentErrorEvent . CreatedAt - top . CreatedAt ) > thresholdBetweenRestarts )
429
+ if ( ( currentErrorEvent . CreatedAt - top . CreatedAt ) > _thresholdBetweenRestarts )
397
430
{
398
431
while ( ! _languageWorkerErrors . IsEmpty )
399
432
{
400
433
_languageWorkerErrors . TryPop ( out WorkerErrorEvent popped ) ;
401
- _logger . LogDebug ( $ "Popping out errorEvent createdAt:{ popped . CreatedAt } workerId:{ popped . WorkerId } ") ;
434
+ _logger . LogDebug ( $ "Popping out errorEvent createdAt: ' { popped . CreatedAt } ' workerId: ' { popped . WorkerId } ' ") ;
402
435
}
403
436
}
404
437
}
@@ -407,14 +440,21 @@ private void AddOrUpdateErrorBucket(WorkerErrorEvent currentErrorEvent)
407
440
408
441
protected virtual void Dispose ( bool disposing )
409
442
{
410
- if ( ! _disposed && disposing )
443
+ if ( ! _disposed )
411
444
{
412
- _logger . LogDebug ( "Disposing FunctionDispatcher" ) ;
413
- _workerErrorSubscription . Dispose ( ) ;
414
- _workerRestartSubscription . Dispose ( ) ;
415
- _processStartCancellationToken . Cancel ( ) ;
416
- _processStartCancellationToken . Dispose ( ) ;
417
- _jobHostLanguageWorkerChannelManager . ShutdownChannels ( ) ;
445
+ if ( _disposing )
446
+ {
447
+ _logger . LogDebug ( "Disposing FunctionDispatcher" ) ;
448
+ _disposeToken . Cancel ( ) ;
449
+ _disposeToken . Dispose ( ) ;
450
+ _restartWorkerProcessSLock . Dispose ( ) ;
451
+ _workerErrorSubscription . Dispose ( ) ;
452
+ _workerRestartSubscription . Dispose ( ) ;
453
+ _processStartCancellationToken . Cancel ( ) ;
454
+ _processStartCancellationToken . Dispose ( ) ;
455
+ _jobHostLanguageWorkerChannelManager . ShutdownChannels ( ) ;
456
+ }
457
+
418
458
State = FunctionInvocationDispatcherState . Disposed ;
419
459
_disposed = true ;
420
460
}
@@ -435,7 +475,7 @@ public async Task<bool> RestartWorkerWithInvocationIdAsync(string invocationId)
435
475
{
436
476
if ( channel . IsExecutingInvocation ( invocationId ) )
437
477
{
438
- _logger . LogInformation ( $ "Restarting channel '{ channel . Id } ' that is executing invocation '{ invocationId } ' and timed out.") ;
478
+ _logger . LogDebug ( $ "Restarting channel with workerId: '{ channel . Id } ' that is executing invocation: '{ invocationId } ' and timed out.") ;
439
479
await DisposeAndRestartWorkerChannel ( _workerRuntime , channel . Id ) ;
440
480
return true ;
441
481
}
0 commit comments