Skip to content

Commit 0e0cadb

Browse files
authored
fixing startup deadlock (#11142)
1 parent b50cba3 commit 0e0cadb

File tree

5 files changed

+158
-26
lines changed

5 files changed

+158
-26
lines changed

release_notes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
-->
66
- Adding activity sources for Durable and WebJobs (Kafka and RabbitMQ) (#11137)
77
- Add JitTrace Files for v4.1041
8+
- Fix startup deadlock on transient exceptions (#11142)

src/WebJobs.Script/Host/WorkerFunctionMetadataProvider.cs

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,16 @@
1111
using Microsoft.Azure.WebJobs.Script.Description;
1212
using Microsoft.Azure.WebJobs.Script.Diagnostics.Extensions;
1313
using Microsoft.Azure.WebJobs.Script.Workers.Rpc;
14+
using Microsoft.Extensions.DependencyInjection;
15+
using Microsoft.Extensions.Hosting;
1416
using Microsoft.Extensions.Logging;
1517
using Microsoft.Extensions.Options;
1618
using Newtonsoft.Json;
1719
using Newtonsoft.Json.Linq;
1820

1921
namespace Microsoft.Azure.WebJobs.Script
2022
{
21-
internal class WorkerFunctionMetadataProvider : IWorkerFunctionMetadataProvider
23+
internal class WorkerFunctionMetadataProvider : IWorkerFunctionMetadataProvider, IDisposable
2224
{
2325
private const string _metadataProviderName = "Worker";
2426
private readonly Dictionary<string, ICollection<string>> _functionErrors = new Dictionary<string, ICollection<string>>();
@@ -30,6 +32,7 @@ internal class WorkerFunctionMetadataProvider : IWorkerFunctionMetadataProvider
3032
private readonly JsonSerializerSettings _dateTimeSerializerSettings;
3133
private string _workerRuntime;
3234
private ImmutableArray<FunctionMetadata> _functions;
35+
private IHost _currentJobHost = null;
3336

3437
public WorkerFunctionMetadataProvider(
3538
IOptionsMonitor<ScriptApplicationHostOptions> scriptOptions,
@@ -45,6 +48,8 @@ public WorkerFunctionMetadataProvider(
4548
_scriptHostManager = scriptHostManager;
4649
_workerRuntime = _environment.GetEnvironmentVariable(EnvironmentSettingNames.FunctionWorkerRuntime);
4750
_dateTimeSerializerSettings = new JsonSerializerSettings { DateParseHandling = DateParseHandling.None };
51+
52+
_scriptHostManager.ActiveHostChanged += OnHostChanged;
4853
}
4954

5055
public ImmutableDictionary<string, ImmutableArray<string>> FunctionErrors
@@ -83,19 +88,14 @@ public async Task<FunctionMetadataResult> GetFunctionMetadataAsync(IEnumerable<R
8388
// Start up GRPC channels if they are not already running.
8489
if (channels?.Any() != true)
8590
{
86-
if (_scriptHostManager.State is ScriptHostState.Default
87-
|| _scriptHostManager.State is ScriptHostState.Starting
88-
|| _scriptHostManager.State is ScriptHostState.Initialized)
91+
if (IsJobHostStarting())
8992
{
90-
// We don't need to restart if the host hasn't even been created yet.
91-
_logger.LogDebug("Host is starting up, initializing language worker channel");
93+
_logger.LogDebug("JobHost is starting with state '{State}'. Initializing worker channel.", _scriptHostManager.State);
9294
await _channelManager.InitializeChannelAsync(workerConfigs, _workerRuntime);
9395
}
9496
else
9597
{
96-
// During the restart flow, GetFunctionMetadataAsync gets invoked
97-
// again through a new script host initialization flow.
98-
_logger.LogDebug("Host is running without any initialized channels, restarting the JobHost.");
98+
_logger.LogDebug("JobHost has started and has state '{State}' without any worker channels. Restarting host to reinitialize.", _scriptHostManager.State);
9999
await _scriptHostManager.RestartHostAsync();
100100
}
101101

@@ -149,6 +149,38 @@ public async Task<FunctionMetadataResult> GetFunctionMetadataAsync(IEnumerable<R
149149
return new FunctionMetadataResult(useDefaultMetadataIndexing: false, _functions);
150150
}
151151

152+
/// <summary>
/// Handles the script host manager's <c>ActiveHostChanged</c> event by remembering the
/// host that has just become active.
/// </summary>
/// <param name="sender">The event source; not used.</param>
/// <param name="args">Event data; its <c>NewHost</c> value becomes the tracked host.</param>
private void OnHostChanged(object sender, ActiveHostChangedEventArgs args)
    => _currentJobHost = args.NewHost; // Retained so the host's state can be queried later if needed.
157+
158+
/// <summary>
/// Determines whether the JobHost is still in the process of starting.
/// </summary>
/// <returns>
/// <c>true</c> when the host manager state is <c>Default</c>, <c>Starting</c> or
/// <c>Initialized</c>, or when the state is <c>Error</c> and the tracked host's
/// <see cref="IHostApplicationLifetime.ApplicationStarted"/> token has not been signalled;
/// otherwise <c>false</c>.
/// </returns>
private bool IsJobHostStarting()
{
    // Read the state and the tracked host exactly once. _currentJobHost is reassigned by
    // OnHostChanged and State is owned by the host manager, so re-reading either between
    // checks could evaluate the conditions below against different values.
    var state = _scriptHostManager.State;
    var host = _currentJobHost;

    if (state is ScriptHostState.Default
        or ScriptHostState.Starting
        or ScriptHostState.Initialized)
    {
        return true;
    }

    // The Error state can occur when the host is in a "final" state after completely starting,
    // or during a retry of a transient error. This check allows us to determine the difference. If
    // the host has not completely started, it means that it is still in the process of starting.
    if (state != ScriptHostState.Error || host is null)
    {
        return false;
    }

    var lifetime = host.Services?.GetService<IHostApplicationLifetime>();

    return lifetime is not null
        && !lifetime.ApplicationStarted.IsCancellationRequested;
}
183+
152184
internal void ValidateFunctionAppFormat(string scriptPath, ILogger logger, IEnvironment environment, IFileSystem fileSystem = null)
153185
{
154186
fileSystem = fileSystem ?? FileUtility.Instance;
@@ -298,5 +330,10 @@ private bool IsNullOrEmpty(IEnumerable<RawFunctionMetadata> functions)
298330
}
299331
return false;
300332
}
333+
334+
/// <summary>
/// Detaches this provider's <c>ActiveHostChanged</c> handler from the script host manager.
/// </summary>
public void Dispose() => _scriptHostManager.ActiveHostChanged -= OnHostChanged;
301338
}
302339
}

test/WebJobs.Script.Tests.Integration/Host/WebJobsScriptHostServiceTests.cs

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
using Microsoft.Azure.WebJobs.Description;
1515
using Microsoft.Azure.WebJobs.Host.Config;
1616
using Microsoft.Azure.WebJobs.Script.Scale;
17+
using Microsoft.Azure.WebJobs.Script.Tests.Integration.WebHostEndToEnd;
1718
using Microsoft.Azure.WebJobs.Script.WebHost;
1819
using Microsoft.Azure.WebJobs.Script.WebHost.Authentication;
1920
using Microsoft.Extensions.DependencyInjection;
@@ -354,23 +355,6 @@ public void Dispose()
354355
_testHost?.Dispose();
355356
}
356357

357-
private class InterceptingScriptHostBuilder : IScriptHostBuilder
358-
{
359-
private readonly DefaultScriptHostBuilder _builder;
360-
private readonly Func<IScriptHostBuilder, bool, bool, IHost> _interceptCallback;
361-
362-
public InterceptingScriptHostBuilder(IOptionsMonitor<ScriptApplicationHostOptions> appHostOptions, IServiceProvider rootServiceProvider, IServiceCollection rootServices, Func<IScriptHostBuilder, bool, bool, IHost> interceptCallback)
363-
{
364-
_builder = new DefaultScriptHostBuilder(appHostOptions, rootServices, rootServiceProvider);
365-
_interceptCallback = interceptCallback;
366-
}
367-
368-
public IHost BuildHost(bool skipHostStartup, bool skipHostConfigurationParsing)
369-
{
370-
return _interceptCallback(_builder, skipHostStartup, skipHostConfigurationParsing);
371-
}
372-
}
373-
374358
[Extension("TestWebHook", "TestWebHook")]
375359
private class TestWebHookExtension : IExtensionConfigProvider, IAsyncConverter<HttpRequestMessage, HttpResponseMessage>
376360
{
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
using System;
2+
using Microsoft.Azure.WebJobs.Script.WebHost;
3+
using Microsoft.Extensions.DependencyInjection;
4+
using Microsoft.Extensions.Hosting;
5+
using Microsoft.Extensions.Options;
6+
7+
namespace Microsoft.Azure.WebJobs.Script.Tests.Integration.WebHostEndToEnd;
8+
9+
/// <summary>
/// An <see cref="IScriptHostBuilder"/> for tests that hands every build request to a
/// caller-supplied callback together with a wrapped <see cref="DefaultScriptHostBuilder"/>,
/// letting the callback decide how (or whether) the host is actually built.
/// </summary>
internal class InterceptingScriptHostBuilder : IScriptHostBuilder
{
    private readonly Func<IScriptHostBuilder, bool, bool, IHost> _interceptCallback;
    private readonly DefaultScriptHostBuilder _builder;

    /// <summary>
    /// Creates the wrapped default builder and stores the interception callback.
    /// </summary>
    /// <param name="appHostOptions">Options passed to the wrapped default builder.</param>
    /// <param name="rootServiceProvider">Root service provider passed to the wrapped default builder.</param>
    /// <param name="rootServices">Root service collection passed to the wrapped default builder.</param>
    /// <param name="interceptCallback">
    /// Callback invoked by <see cref="BuildHost"/> with the wrapped builder and both build flags;
    /// its return value is the host that <see cref="BuildHost"/> returns.
    /// </param>
    public InterceptingScriptHostBuilder(
        IOptionsMonitor<ScriptApplicationHostOptions> appHostOptions,
        IServiceProvider rootServiceProvider,
        IServiceCollection rootServices,
        Func<IScriptHostBuilder, bool, bool, IHost> interceptCallback)
    {
        _interceptCallback = interceptCallback;
        _builder = new DefaultScriptHostBuilder(appHostOptions, rootServices, rootServiceProvider);
    }

    /// <summary>
    /// Forwards the build request to the interception callback.
    /// </summary>
    /// <param name="skipHostStartup">Forwarded unchanged to the callback.</param>
    /// <param name="skipHostConfigurationParsing">Forwarded unchanged to the callback.</param>
    /// <returns>The host returned by the callback.</returns>
    public IHost BuildHost(bool skipHostStartup, bool skipHostConfigurationParsing)
        => _interceptCallback(_builder, skipHostStartup, skipHostConfigurationParsing);
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the MIT License. See License.txt in the project root for license information.
3+
4+
using System;
5+
using System.Linq;
6+
using System.Threading.Tasks;
7+
using Microsoft.Azure.WebJobs.Script.WebHost;
8+
using Microsoft.Extensions.DependencyInjection;
9+
using Microsoft.Extensions.Hosting;
10+
using Microsoft.Extensions.Options;
11+
using Xunit;
12+
13+
namespace Microsoft.Azure.WebJobs.Script.Tests.Integration.WebHostEndToEnd;
14+
15+
/// <summary>
/// End-to-end tests for web host startup when building the script host fails transiently.
/// </summary>
public class WebHostStartupEndToEndTests
{
    /// <summary>
    /// Fails the first script host build, then verifies that the host recovers, serves an HTTP
    /// function, and logs exactly one "JobHost is starting" debug message reporting the Error state.
    /// </summary>
    [Fact]
    public async Task TransientError_DuringHostBuild_DoesNotDeadlock()
    {
        bool thrownOnce = false;
        void ThrowOnFirstBuild()
        {
            if (!thrownOnce)
            {
                // Simulate a transient error during first host build
                thrownOnce = true;
                throw new InvalidOperationException("Simulated transient error during host build.");
            }
        }

        var fixture = new WebHostStartupEndToEndTestFixture(ThrowOnFirstBuild);

        try
        {
            await fixture.InitializeAsync();

            // This should recover as the second call to BuildHost should succeed
            await TestHelpers.Await(async () =>
            {
                // Dispose each polled response so repeated attempts do not accumulate them.
                using var result = await fixture.Host.HttpClient.GetAsync("/api/HttpRequestDataFunction");
                return result.IsSuccessStatusCode && await result.Content.ReadAsStringAsync() == "Welcome to Azure Functions!";
            }, 10000, userMessageCallback: fixture.Host.GetLog);

            var debugMsg = fixture.Host.GetWebHostLogMessages("Microsoft.Azure.WebJobs.Script.WorkerFunctionMetadataProvider")
                .Where(m => m.Level == Microsoft.Extensions.Logging.LogLevel.Debug)
                .Where(m => m.FormattedMessage.StartsWith("JobHost is starting with state", StringComparison.Ordinal));

            // Assert.Single returns the matched element, so the deferred query is enumerated
            // only once and both assertions inspect the same message.
            var startingMessage = Assert.Single(debugMsg);
            Assert.Contains("'Error'", startingMessage.FormattedMessage);
        }
        finally
        {
            await fixture.DisposeAsync();
        }
    }

    /// <summary>
    /// Fixture that registers an <see cref="InterceptingScriptHostBuilder"/> so an optional
    /// action runs before every script host build.
    /// </summary>
    private class WebHostStartupEndToEndTestFixture : EndToEndTestFixture
    {
        private readonly Action _scriptHostBuildInterceptor;

        /// <param name="scriptHostBuildInterceptor">
        /// Optional action invoked before each host build; an exception it throws propagates
        /// out of the build call.
        /// </param>
        public WebHostStartupEndToEndTestFixture(Action scriptHostBuildInterceptor = null)
            : base(@"..\..\DotNetIsolated60\debug", "WebHostStartupEndToEndTests", "dotnet-isolated")
        {
            _scriptHostBuildInterceptor = scriptHostBuildInterceptor;
        }

        // This test creates no storage entities.
        protected override Task CreateTestStorageEntities() => Task.CompletedTask;

        public override void ConfigureWebHost(IServiceCollection rootServices)
        {
            rootServices.AddSingleton<IScriptHostBuilder>(rootProvider =>
            {
                var appHostOptions = rootProvider.GetService<IOptionsMonitor<ScriptApplicationHostOptions>>();

                IHost Intercept(IScriptHostBuilder builder, bool skipHostStartup, bool skipHostConfigurationParsing)
                {
                    // Run the test hook first so it can fail the build before the real builder runs.
                    _scriptHostBuildInterceptor?.Invoke();

                    return builder.BuildHost(skipHostStartup, skipHostConfigurationParsing);
                }

                return new InterceptingScriptHostBuilder(appHostOptions, rootProvider, rootServices, Intercept);
            });
        }
    }
}

0 commit comments

Comments
 (0)