Skip to content

Commit 265424c

Browse files
authored
Only start flush logs timer when there's an event to write (#11100)
* Enable flush logs timer when there's an event to log * Set errors to warning * Disable service when there's any failure (even if transient) as part of flush logs, purge events or execute batch. * Check for _hostId first so we avoid lookups. * Adding release notes
1 parent aad007c commit 265424c

File tree

4 files changed

+75
-13
lines changed

4 files changed

+75
-13
lines changed

release_notes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@
1616
- Memory allocation optimizations in `FeatureFlags.IsEnabled` by adopting zero-allocation `ContainsToken` for efficient delimited token search (#11075)
1717
- Improvements to coldstart pipeline (#11102).
1818
- Update Python Worker Version to [4.38.0](https://github.com/Azure/azure-functions-python-worker/releases/tag/4.38.0)
19+
- Only start the Diagnostic Events flush logs timer when events are present, preventing unnecessary flush attempts (#11100).

src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.Log.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,16 @@ private static class Logger
3838
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(6, nameof(DeletingTableWithOutdatedEventVersion)), "Deleting table '{tableName}' as it contains records with an outdated EventVersion.");
3939

4040
private static readonly Action<ILogger, Exception> _errorPurgingDiagnosticEventVersions =
41-
LoggerMessage.Define(LogLevel.Error, new EventId(7, nameof(ErrorPurgingDiagnosticEventVersions)), "Error occurred when attempting to purge previous diagnostic event versions.");
41+
LoggerMessage.Define(LogLevel.Warning, new EventId(7, nameof(ErrorPurgingDiagnosticEventVersions)), "Error occurred when attempting to purge previous diagnostic event versions.");
4242

4343
private static readonly Action<ILogger, Exception> _unableToGetTableReference =
44-
LoggerMessage.Define(LogLevel.Error, new EventId(8, nameof(UnableToGetTableReference)), "Unable to get table reference. Aborting write operation.");
44+
LoggerMessage.Define(LogLevel.Warning, new EventId(8, nameof(UnableToGetTableReference)), "Unable to get table reference. Aborting write operation.");
4545

4646
private static readonly Action<ILogger, Exception> _unableToGetTableReferenceOrCreateTable =
47-
LoggerMessage.Define(LogLevel.Error, new EventId(9, nameof(UnableToGetTableReferenceOrCreateTable)), "Unable to get table reference or create table. Aborting write operation.");
47+
LoggerMessage.Define(LogLevel.Warning, new EventId(9, nameof(UnableToGetTableReferenceOrCreateTable)), "Unable to get table reference or create table. Aborting write operation.");
4848

4949
private static readonly Action<ILogger, Exception> _unableToWriteDiagnosticEvents =
50-
LoggerMessage.Define(LogLevel.Error, new EventId(10, nameof(UnableToWriteDiagnosticEvents)), "Unable to write diagnostic events to table storage.");
50+
LoggerMessage.Define(LogLevel.Warning, new EventId(10, nameof(UnableToWriteDiagnosticEvents)), "Unable to write diagnostic events to table storage.");
5151

5252
private static readonly Action<ILogger, Exception> _primaryHostStateProviderNotAvailable =
5353
LoggerMessage.Define(LogLevel.Debug, new EventId(11, nameof(PrimaryHostStateProviderNotAvailable)), "PrimaryHostStateProvider is not available. Skipping the check for primary host.");

src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.cs

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public partial class DiagnosticEventTableStorageRepository : IDiagnosticEventRep
2626
private const int LogFlushInterval = 1000 * 60 * 10; // 10 minutes
2727
private const int TableCreationMaxRetryCount = 5;
2828

29-
private readonly Timer _flushLogsTimer;
29+
private readonly Lazy<Timer> _flushLogsTimer;
3030
private readonly IHostIdProvider _hostIdProvider;
3131
private readonly IEnvironment _environment;
3232
private readonly IAzureTableStorageProvider _azureTableStorageProvider;
@@ -50,7 +50,7 @@ internal DiagnosticEventTableStorageRepository(IHostIdProvider hostIdProvider, I
5050
_environment = environment;
5151
_serviceProvider = scriptHostManager as IServiceProvider;
5252
_logger = logger;
53-
_flushLogsTimer = new Timer(OnFlushLogs, null, logFlushInterval, logFlushInterval);
53+
_flushLogsTimer = new Lazy<Timer>(() => new Timer(OnFlushLogs, null, logFlushInterval, logFlushInterval));
5454
_azureTableStorageProvider = azureTableStorageProvider;
5555
}
5656

@@ -107,7 +107,7 @@ internal string HostId
107107
{
108108
get
109109
{
110-
if (!_environment.IsPlaceholderModeEnabled() && string.IsNullOrEmpty(_hostId))
110+
if (string.IsNullOrEmpty(_hostId) && !_environment.IsPlaceholderModeEnabled())
111111
{
112112
_hostId = _hostIdProvider?.GetHostIdAsync(CancellationToken.None).GetAwaiter().GetResult();
113113
}
@@ -192,9 +192,23 @@ await Utility.InvokeWithRetriesAsync(async () =>
192192

193193
_purged = true;
194194
}
195+
catch (RequestFailedException ex) when (ex.Status == (int)HttpStatusCode.Forbidden)
196+
{
197+
// If we reach this point, we already checked for permissions on TableClient initialization.
198+
// It is possible that the permissions changed after the initialization, any firewall/network rules were changed or it's a custom role where we don't have permissions to query entities.
199+
// We will log the error and disable the service.
200+
Logger.ErrorPurgingDiagnosticEventVersions(_logger, ex);
201+
DisableService();
202+
Logger.ServiceDisabledUnauthorizedClient(_logger, ex);
203+
}
195204
catch (Exception ex)
196205
{
206+
// We failed to connect to the table storage account. This could be due to a transient error or a configuration issue (e.g., network problems).
207+
// To avoid repeatedly retrying in a potentially unhealthy state, we will disable the service.
208+
// The operation may succeed in a future instance if the underlying issue is resolved.
197209
Logger.ErrorPurgingDiagnosticEventVersions(_logger, ex);
210+
DisableService();
211+
Logger.ServiceDisabledUnableToConnectToStorage(_logger, ex);
198212
}
199213
}, maxRetries: 5, retryInterval: TimeSpan.FromSeconds(5));
200214

@@ -245,12 +259,16 @@ internal virtual async Task FlushLogs(TableClient table = null)
245259
Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex);
246260
DisableService();
247261
Logger.ServiceDisabledUnauthorizedClient(_logger, ex);
262+
return;
248263
}
249264
catch (Exception ex)
250265
{
266+
// We failed to connect to the table storage account. This could be due to a transient error or a configuration issue (e.g., network problems).
267+
// To avoid repeatedly retrying in a potentially unhealthy state, we will disable the service.
268+
// The operation may succeed in a future instance if the underlying issue is resolved.
251269
Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex);
252-
// Clearing the memory cache to avoid memory build up.
253-
_events.Clear();
270+
DisableService();
271+
Logger.ServiceDisabledUnableToConnectToStorage(_logger, ex);
254272
return;
255273
}
256274

@@ -292,12 +310,14 @@ internal async Task ExecuteBatchAsync(ConcurrentDictionary<string, DiagnosticEve
292310
catch (Exception ex)
293311
{
294312
Logger.UnableToWriteDiagnosticEvents(_logger, ex);
313+
DisableService();
314+
Logger.ServiceDisabledUnableToConnectToStorage(_logger, ex);
295315
}
296316
}
297317

298318
public void WriteDiagnosticEvent(DateTime timestamp, string errorCode, LogLevel level, string message, string helpLink, Exception exception)
299319
{
300-
if (TableClient is null || string.IsNullOrEmpty(HostId))
320+
if (TableClient is null || string.IsNullOrEmpty(HostId) || !IsEnabled())
301321
{
302322
return;
303323
}
@@ -320,6 +340,18 @@ public void WriteDiagnosticEvent(DateTime timestamp, string errorCode, LogLevel
320340
_events[errorCode].HitCount++;
321341
}
322342
}
343+
344+
EnsureFlushLogsTimerInitialized();
345+
}
346+
347+
internal void EnsureFlushLogsTimerInitialized()
348+
{
349+
if (_disposed || !IsEnabled())
350+
{
351+
return;
352+
}
353+
354+
_ = _flushLogsTimer.Value;
323355
}
324356

325357
public bool IsEnabled()
@@ -341,8 +373,13 @@ private bool IsPrimaryHost()
341373

342374
private void StopTimer()
343375
{
376+
if (!_flushLogsTimer.IsValueCreated)
377+
{
378+
return;
379+
}
380+
344381
Logger.StoppingFlushLogsTimer(_logger);
345-
_flushLogsTimer?.Change(Timeout.Infinite, Timeout.Infinite);
382+
_flushLogsTimer?.Value?.Change(Timeout.Infinite, Timeout.Infinite);
346383
}
347384

348385
protected virtual void Dispose(bool disposing)
@@ -351,9 +388,9 @@ protected virtual void Dispose(bool disposing)
351388
{
352389
if (disposing)
353390
{
354-
if (_flushLogsTimer != null)
391+
if (_flushLogsTimer?.Value != null)
355392
{
356-
_flushLogsTimer.Dispose();
393+
_flushLogsTimer?.Value?.Dispose();
357394
}
358395

359396
FlushLogs().GetAwaiter().GetResult();

test/WebJobs.Script.Tests.Integration/Diagnostics/DiagnosticEventTableStorageRepositoryTests.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ public async Task TimerFlush_CalledOnExpectedInterval()
6969

7070
DiagnosticEventTableStorageRepository repository = mockDiagnosticEventTableStorageRepository.Object;
7171

72+
// Write a diagnostic event to trigger the timer
73+
mockDiagnosticEventTableStorageRepository.Object.WriteDiagnosticEvent(DateTime.UtcNow, "eh1", LogLevel.Information, "This is the message", "https://fwlink/", new Exception("exception message"));
74+
7275
int numFlushes = 0;
7376
mockDiagnosticEventTableStorageRepository.Protected().Setup("OnFlushLogs", ItExpr.IsAny<object>())
7477
.Callback<object>((state) =>
@@ -81,6 +84,27 @@ public async Task TimerFlush_CalledOnExpectedInterval()
8184
mockDiagnosticEventTableStorageRepository.VerifyAll();
8285
}
8386

87+
[Fact]
88+
public void TimerFlush_NotStartedWithRepositoryDisabled()
89+
{
90+
IEnvironment testEnvironment = new TestEnvironment();
91+
testEnvironment.SetEnvironmentVariable(EnvironmentSettingNames.AzureWebsitePlaceholderMode, "0");
92+
93+
var configuration = new ConfigurationBuilder().Build();
94+
var localStorageProvider = TestHelpers.GetAzureTableStorageProvider(configuration);
95+
DiagnosticEventTableStorageRepository repository =
96+
new DiagnosticEventTableStorageRepository(_hostIdProvider, testEnvironment, _scriptHostMock.Object, localStorageProvider, _logger);
97+
98+
repository.WriteDiagnosticEvent(DateTime.UtcNow, "eh1", LogLevel.Information, "This is the message", "https://fwlink/", new Exception("exception message"));
99+
100+
var flushLogsTimerField = typeof(DiagnosticEventTableStorageRepository)
101+
.GetField("_flushLogsTimer", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance);
102+
var lazyTimer = flushLogsTimerField.GetValue(repository) as Lazy<Timer>;
103+
104+
Assert.False(repository.IsEnabled());
105+
Assert.False(lazyTimer.IsValueCreated, "Timer should not be created when repository is disabled");
106+
}
107+
84108
[Fact]
85109
public void WriteDiagnostic_LogsError_WhenHostIdNotSet()
86110
{

0 commit comments

Comments
 (0)