Skip to content

Commit f4a9393

Browse files
committed
Adding exponential backoff to error/retry (#1162)
1 parent 4b8f72a commit f4a9393

File tree

9 files changed

+190
-31
lines changed

9 files changed

+190
-31
lines changed

src/WebJobs.Script/Config/ScriptHostConfiguration.cs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ public ScriptHostConfiguration()
1818
FileLoggingMode = FileLoggingMode.Never;
1919
RootScriptPath = Environment.CurrentDirectory;
2020
RootLogPath = Path.Combine(Path.GetTempPath(), "Functions");
21-
RestartInterval = TimeSpan.FromSeconds(5);
2221
}
2322

2423
/// <summary>
@@ -89,11 +88,5 @@ public ScriptHostConfiguration()
8988
/// there is no timeout duration.
9089
/// </summary>
9190
public TimeSpan? FunctionTimeout { get; set; }
92-
93-
/// <summary>
94-
/// Gets or sets the restart interval to use between restart attempts
95-
/// when the host is in an error state. The default is 5 seconds.
96-
/// </summary>
97-
public TimeSpan RestartInterval { get; set; }
9891
}
9992
}

src/WebJobs.Script/GlobalSuppressions.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,4 +126,7 @@
126126
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "Sku", Scope = "member", Target = "Microsoft.Azure.WebJobs.Script.ScriptConstants.#DynamicSku")]
127127
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "Sku", Scope = "member", Target = "Microsoft.Azure.WebJobs.Script.Config.ScriptSettingsManager.#IsDynamicSku")]
128128
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2000:Dispose objects before losing scope", Scope = "member", Target = "Microsoft.Azure.WebJobs.Script.ScriptHost.#Create(Microsoft.Azure.WebJobs.Script.ScriptHostConfiguration,Microsoft.Azure.WebJobs.Script.Config.ScriptSettingsManager)")]
129+
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Usage", "CA2213:DisposableFieldsShouldBeDisposed", MessageId = "_restartDelayTokenSource", Scope = "member", Target = "Microsoft.Azure.WebJobs.Script.ScriptHostManager.#Dispose(System.Boolean)")]
130+
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Usage", "CA2213:DisposableFieldsShouldBeDisposed", MessageId = "_scriptFileWatcher", Scope = "member", Target = "Microsoft.Azure.WebJobs.Script.ScriptHostManager.#Dispose(System.Boolean)")]
131+
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "Backoff", Scope = "member", Target = "Microsoft.Azure.WebJobs.Script.Utility.#DelayWithBackoffAsync(System.Int32,System.Threading.CancellationToken,System.Nullable`1<System.TimeSpan>,System.Nullable`1<System.TimeSpan>,System.Nullable`1<System.TimeSpan>)")]
129132

src/WebJobs.Script/Host/ScriptHostManager.cs

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
using System.Collections.Generic;
66
using System.Diagnostics;
77
using System.Globalization;
8+
using System.IO;
89
using System.Linq;
910
using System.Text;
1011
using System.Threading;
1112
using System.Threading.Tasks;
1213
using Microsoft.Azure.WebJobs.Host;
1314
using Microsoft.Azure.WebJobs.Script.Config;
1415
using Microsoft.Azure.WebJobs.Script.Diagnostics;
16+
using Microsoft.Azure.WebJobs.Script.IO;
1517

1618
namespace Microsoft.Azure.WebJobs.Script
1719
{
@@ -37,10 +39,17 @@ public class ScriptHostManager : IDisposable
3739
private TraceWriter _traceWriter;
3840

3941
private ScriptSettingsManager _settingsManager;
42+
private AutoRecoveringFileSystemWatcher _scriptFileWatcher;
43+
private CancellationTokenSource _restartDelayTokenSource;
4044

4145
public ScriptHostManager(ScriptHostConfiguration config)
4246
: this(config, ScriptSettingsManager.Instance, new ScriptHostFactory())
4347
{
48+
if (config.FileWatchingEnabled)
49+
{
50+
_scriptFileWatcher = new AutoRecoveringFileSystemWatcher(config.RootScriptPath);
51+
_scriptFileWatcher.Changed += OnScriptFileChanged;
52+
}
4453
}
4554

4655
public ScriptHostManager(ScriptHostConfiguration config, ScriptSettingsManager settingsManager, IScriptHostFactory scriptHostFactory)
@@ -83,8 +92,7 @@ public bool CanInvoke()
8392

8493
public void RunAndBlock(CancellationToken cancellationToken = default(CancellationToken))
8594
{
86-
// Start the host and restart it if requested. Host Restarts will happen when
87-
// host level configuration files change
95+
int consecutiveErrorCount = 0;
8896
do
8997
{
9098
ScriptHost newInstance = null;
@@ -116,8 +124,8 @@ public bool CanInvoke()
116124

117125
if (_traceWriter != null)
118126
{
119-
string message = string.Format("Starting Host (HostId={0}, Version={1}, ProcessId={2}, Debug={3})",
120-
newInstance.ScriptConfig.HostConfig.HostId, ScriptHost.Version, Process.GetCurrentProcess().Id, newInstance.InDebugMode.ToString());
127+
string message = string.Format("Starting Host (HostId={0}, Version={1}, ProcessId={2}, Debug={3}, Attempt={4})",
128+
newInstance.ScriptConfig.HostConfig.HostId, ScriptHost.Version, Process.GetCurrentProcess().Id, newInstance.InDebugMode.ToString(), consecutiveErrorCount);
121129
_traceWriter.Info(message);
122130
}
123131
newInstance.StartAsync(cancellationToken).GetAwaiter().GetResult();
@@ -131,6 +139,8 @@ public bool CanInvoke()
131139
// state to Running
132140
State = ScriptHostState.Running;
133141
LastError = null;
142+
consecutiveErrorCount = 0;
143+
_restartDelayTokenSource = null;
134144

135145
// Wait for a restart signal. This event will automatically reset.
136146
// While we're restarting, it is possible for another restart to be
@@ -155,6 +165,7 @@ public bool CanInvoke()
155165
{
156166
State = ScriptHostState.Error;
157167
LastError = ex;
168+
consecutiveErrorCount++;
158169

159170
// We need to keep the host running, so we catch and log any errors
160171
// then restart the host
@@ -177,15 +188,22 @@ public bool CanInvoke()
177188
}, TaskContinuationOptions.ExecuteSynchronously);
178189
}
179190

180-
// Wait for a short period of time before restarting to
181-
// avoid cases where a host level config error might cause
182-
// a rapid restart cycle
183-
Task.Delay(_config.RestartInterval).GetAwaiter().GetResult();
191+
// attempt restarts using an exponential backoff strategy
192+
CreateRestartBackoffDelay(consecutiveErrorCount).GetAwaiter().GetResult();
184193
}
185194
}
186195
while (!_stopped && !cancellationToken.IsCancellationRequested);
187196
}
188197

198+
private Task CreateRestartBackoffDelay(int consecutiveErrorCount)
199+
{
200+
_restartDelayTokenSource = new CancellationTokenSource();
201+
202+
return Utility.DelayWithBackoffAsync(consecutiveErrorCount, _restartDelayTokenSource.Token,
203+
min: TimeSpan.FromSeconds(1),
204+
max: TimeSpan.FromMinutes(2));
205+
}
206+
189207
private static void LogErrors(ScriptHost host)
190208
{
191209
if (host.FunctionErrors.Count > 0)
@@ -234,6 +252,19 @@ private async Task Orphan(ScriptHost instance, bool forceStop = false)
234252
}
235253
}
236254

255+
/// <summary>
256+
/// Handler for any file changes under the root script path
257+
/// </summary>
258+
private void OnScriptFileChanged(object sender, FileSystemEventArgs e)
259+
{
260+
if (_restartDelayTokenSource != null && !_restartDelayTokenSource.IsCancellationRequested)
261+
{
262+
// if we're currently awaiting an error/restart delay
263+
// cancel that
264+
_restartDelayTokenSource.Cancel();
265+
}
266+
}
267+
237268
public async Task StopAsync()
238269
{
239270
_stopped = true;
@@ -326,6 +357,8 @@ protected virtual void Dispose(bool disposing)
326357
}
327358

328359
_stopEvent.Dispose();
360+
_restartDelayTokenSource?.Dispose();
361+
_scriptFileWatcher?.Dispose();
329362

330363
_disposed = true;
331364
}

src/WebJobs.Script/IO/AutoRecoveringFileSystemWatcher.cs

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -131,23 +131,16 @@ private void Trace(string message, TraceLevel level)
131131

132132
private async Task Recover(int attempt = 1)
133133
{
134-
// Exponential backoff on retries
135-
long wait = Convert.ToInt64((Math.Pow(2, attempt) - 1) / 2);
136-
137-
// maximum wait of 5 minutes
138-
wait = Math.Min(wait, 300);
139-
140-
if (wait > 0)
141-
{
142-
Trace($"Next recovery attempt in {wait} seconds...", TraceLevel.Warning);
143-
await Task.Delay(TimeSpan.FromSeconds(wait)).ConfigureAwait(false);
144-
}
134+
// Exponential backoff on retries with
135+
// a maximum wait of 5 minutes
136+
await Utility.DelayWithBackoffAsync(attempt, _cancellationToken, max: TimeSpan.FromMinutes(5));
145137

146138
// Check if cancellation was requested while we were waiting
147139
_cancellationToken.ThrowIfCancellationRequested();
148140

149141
try
150142
{
143+
Trace($"Attempting to recover...", TraceLevel.Warning);
151144
ReleaseCurrentFileWatcher();
152145
InitializeWatcher();
153146
}

src/WebJobs.Script/Utility.cs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
using System.Globalization;
88
using System.Linq;
99
using System.Text;
10+
using System.Threading;
11+
using System.Threading.Tasks;
1012
using Microsoft.Azure.WebJobs.Script.Config;
1113
using Newtonsoft.Json;
1214
using Newtonsoft.Json.Converters;
@@ -19,6 +21,59 @@ public static class Utility
1921
private static readonly ExpandoObjectConverter _expandoObjectJsonConverter = new ExpandoObjectConverter();
2022
private static readonly string UTF8ByteOrderMark = Encoding.UTF8.GetString(Encoding.UTF8.GetPreamble());
2123

24+
/// <summary>
25+
/// Implements a configurable exponential backoff strategy.
26+
/// </summary>
27+
/// <param name="exponent">The backoff exponent. E.g. for backing off in a retry loop,
28+
/// this would be the retry attempt count.</param>
29+
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to use.</param>
30+
/// <param name="unit">The time unit for the backoff delay. Default is 1 second, producing a backoff sequence
31+
/// of 2, 4, 8, 16, etc. seconds.</param>
32+
/// <param name="min">The minimum delay.</param>
33+
/// <param name="max">The maximum delay.</param>
34+
/// <returns>A <see cref="Task"/> representing the computed backoff interval.</returns>
35+
public async static Task DelayWithBackoffAsync(int exponent, CancellationToken cancellationToken, TimeSpan? unit = null, TimeSpan? min = null, TimeSpan? max = null)
36+
{
37+
TimeSpan delay = ComputeBackoff(exponent, unit, min, max);
38+
39+
if (delay.TotalMilliseconds > 0)
40+
{
41+
try
42+
{
43+
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
44+
}
45+
catch (OperationCanceledException)
46+
{
47+
// when the delay is cancelled it is not an error
48+
}
49+
}
50+
}
51+
52+
internal static TimeSpan ComputeBackoff(int exponent, TimeSpan? unit = null, TimeSpan? min = null, TimeSpan? max = null)
53+
{
54+
// determine the exponential backoff factor
55+
long backoffFactor = Convert.ToInt64((Math.Pow(2, exponent) - 1) / 2);
56+
57+
// compute the backoff delay
58+
unit = unit ?? TimeSpan.FromSeconds(1);
59+
long totalDelayTicks = backoffFactor * unit.Value.Ticks;
60+
TimeSpan delay = TimeSpan.FromTicks(totalDelayTicks);
61+
62+
// apply minimum restriction
63+
if (min.HasValue && delay < min)
64+
{
65+
delay = min.Value;
66+
}
67+
68+
// apply maximum restriction
69+
if (max.HasValue && delay > max)
70+
{
71+
delay = max.Value;
72+
}
73+
74+
return delay;
75+
}
76+
2277
public static string GetSubscriptionId()
2378
{
2479
string ownerName = ScriptSettingsManager.Instance.GetSetting(EnvironmentSettingNames.AzureWebsiteOwnerName) ?? string.Empty;

test/WebJobs.Script.Tests.Integration/Host/ScriptHostManagerTests.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Licensed under the MIT License. See License.txt in the project root for license information.
33

44
using System;
5+
using System.Diagnostics;
56
using System.IO;
67
using System.Linq;
78
using System.Runtime.ExceptionServices;

test/WebJobs.Script.Tests.Integration/Host/WebScriptHostManagerTests.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,6 @@ public async Task MultipleHostRestarts()
148148
RootLogPath = logDir,
149149
RootScriptPath = functionTestDir,
150150
FileLoggingMode = FileLoggingMode.Always,
151-
RestartInterval = TimeSpan.FromMilliseconds(500)
152151
};
153152

154153
ISecretsRepository repository = new FileSystemSecretsRepository(secretsDir);
@@ -357,7 +356,7 @@ public Fixture()
357356
"Info Host lock lease acquired by instance ID '(.+)'",
358357
"Info Function 'Excluded' is marked as excluded",
359358
@"Info Generating ([0-9]+) job function\(s\)",
360-
@"Info Starting Host \(HostId=function-tests-node, Version=(.+), ProcessId=[0-9]+, Debug=False\)",
359+
@"Info Starting Host \(HostId=function-tests-node, Version=(.+), ProcessId=[0-9]+, Debug=False, Attempt=0\)",
361360
"Info WebJobs.Indexing Found the following functions:",
362361
"Info The next 5 occurrences of the schedule will be:",
363362
"Info WebJobs.Host Job host started",

test/WebJobs.Script.Tests/IO/AutoRecoveringFileSystemWatcherTests.cs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,15 @@ public async Task RecoveryTest(int expectedNumberOfAttempts, bool isFailureScena
8383
int expectedTracesBeforeRecovery = (expectedNumberOfAttempts * 2) - 1;
8484
// Before + recovery trace
8585
int expectedTracesAfterRecovery = expectedTracesBeforeRecovery + 1;
86+
if (!isFailureScenario)
87+
{
88+
expectedTracesAfterRecovery++;
89+
}
8690

87-
await TestHelpers.Await(() => traceWriter.Traces.Count == expectedTracesBeforeRecovery, pollingInterval: 500);
91+
await TestHelpers.Await(() =>
92+
{
93+
return traceWriter.Traces.Count == expectedTracesBeforeRecovery;
94+
}, pollingInterval: 500);
8895

8996
if (isFailureScenario)
9097
{
@@ -95,14 +102,22 @@ public async Task RecoveryTest(int expectedNumberOfAttempts, bool isFailureScena
95102
Directory.CreateDirectory(directory.Path);
96103
}
97104

98-
await TestHelpers.Await(() => traceWriter.Traces.Count == expectedTracesAfterRecovery, pollingInterval: 500);
105+
await TestHelpers.Await(() =>
106+
{
107+
return traceWriter.Traces.Count == expectedTracesAfterRecovery;
108+
}, pollingInterval: 500);
99109

100110
TraceEvent failureEvent = traceWriter.Traces.First();
101111
var retryEvents = traceWriter.Traces.Where(t => t.Level == TraceLevel.Warning).Skip(1).ToList();
102112

103113
Assert.Equal(TraceLevel.Warning, failureEvent.Level);
104114
Assert.Contains("Failure detected", failureEvent.Message);
105-
Assert.Equal(expectedNumberOfAttempts - 1, retryEvents.Count);
115+
int expectedRetryEvents = expectedNumberOfAttempts;
116+
if (isFailureScenario)
117+
{
118+
expectedRetryEvents--;
119+
}
120+
Assert.Equal(expectedRetryEvents, retryEvents.Count);
106121

107122
// Validate that our the events happened with the expected intervals
108123
DateTime previoustTimeStamp = failureEvent.Timestamp;

0 commit comments

Comments
 (0)