Skip to content

Commit c11900f

Browse files
committed
Move failure counters
1 parent c78d249 commit c11900f

File tree

8 files changed

+167
-143
lines changed

8 files changed

+167
-143
lines changed

src/ServiceControl.Audit/Auditing/AuditIngestion.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public AuditIngestion(
2727
AuditIngestor auditIngestor,
2828
IAuditIngestionUnitOfWorkFactory unitOfWorkFactory,
2929
IHostApplicationLifetime applicationLifetime,
30-
AuditIngestionMetrics metrics)
30+
IngestionMetrics metrics)
3131
{
3232
inputEndpoint = settings.AuditQueue;
3333
this.transportCustomization = transportCustomization;
@@ -53,7 +53,7 @@ public AuditIngestion(
5353
FullMode = BoundedChannelFullMode.Wait
5454
});
5555

56-
errorHandlingPolicy = new AuditIngestionFaultPolicy(failedImportsStorage, settings.LoggingSettings, OnCriticalError);
56+
errorHandlingPolicy = new AuditIngestionFaultPolicy(failedImportsStorage, settings.LoggingSettings, OnCriticalError, metrics);
5757

5858
watchdog = new Watchdog(
5959
"audit message ingestion",
@@ -314,7 +314,7 @@ public override async Task StopAsync(CancellationToken cancellationToken)
314314
readonly Channel<MessageContext> channel;
315315
readonly Watchdog watchdog;
316316
readonly IHostApplicationLifetime applicationLifetime;
317-
readonly AuditIngestionMetrics metrics;
317+
readonly IngestionMetrics metrics;
318318

319319
static readonly ILog logger = LogManager.GetLogger<AuditIngestion>();
320320

Lines changed: 87 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,115 +1,113 @@
1-
namespace ServiceControl.Audit.Auditing
1+
namespace ServiceControl.Audit.Auditing;
2+
3+
using System;
4+
using System.Diagnostics;
5+
using System.IO;
6+
using System.Runtime.InteropServices;
7+
using System.Runtime.Versioning;
8+
using System.Threading;
9+
using System.Threading.Tasks;
10+
using Infrastructure;
11+
using NServiceBus.Logging;
12+
using NServiceBus.Transport;
13+
using Persistence;
14+
using Configuration;
15+
using Metrics;
16+
using ServiceControl.Infrastructure;
17+
18+
class AuditIngestionFaultPolicy
219
{
3-
using System;
4-
using System.Diagnostics;
5-
using System.Diagnostics.Metrics;
6-
using System.IO;
7-
using System.Runtime.InteropServices;
8-
using System.Runtime.Versioning;
9-
using System.Threading;
10-
using System.Threading.Tasks;
11-
using Infrastructure;
12-
using NServiceBus.Logging;
13-
using NServiceBus.Transport;
14-
using Persistence;
15-
using Configuration;
16-
using ServiceControl.Infrastructure;
17-
18-
class AuditIngestionFaultPolicy
20+
public AuditIngestionFaultPolicy(IFailedAuditStorage failedAuditStorage, LoggingSettings settings, Func<string, Exception, Task> onCriticalError, IngestionMetrics metrics)
1921
{
20-
readonly IFailedAuditStorage failedAuditStorage;
21-
readonly string logPath;
22-
readonly ImportFailureCircuitBreaker failureCircuitBreaker;
22+
failureCircuitBreaker = new ImportFailureCircuitBreaker(onCriticalError);
23+
this.failedAuditStorage = failedAuditStorage;
24+
this.metrics = metrics;
2325

24-
public AuditIngestionFaultPolicy(IFailedAuditStorage failedAuditStorage, LoggingSettings settings, Func<string, Exception, Task> onCriticalError)
26+
if (!AppEnvironment.RunningInContainer)
2527
{
26-
failureCircuitBreaker = new ImportFailureCircuitBreaker(onCriticalError);
27-
this.failedAuditStorage = failedAuditStorage;
28-
29-
if (!AppEnvironment.RunningInContainer)
30-
{
31-
logPath = Path.Combine(settings.LogPath, @"FailedImports\Audit");
32-
Directory.CreateDirectory(logPath);
33-
}
28+
logPath = Path.Combine(settings.LogPath, @"FailedImports\Audit");
29+
Directory.CreateDirectory(logPath);
3430
}
31+
}
3532

36-
public async Task<ErrorHandleResult> OnError(ErrorContext errorContext, CancellationToken cancellationToken = default)
37-
{
38-
var tags = Telemetry.GetIngestedMessageTags(errorContext.Message.Headers, errorContext.Message.Body);
33+
public async Task<ErrorHandleResult> OnError(ErrorContext errorContext, CancellationToken cancellationToken = default)
34+
{
35+
using var errorMetrics = metrics.BeginErrorHandling(errorContext);
3936

40-
//Same as recoverability policy in NServiceBusFactory
41-
if (errorContext.ImmediateProcessingFailures < 3)
42-
{
43-
retryCounter.Add(1, tags);
44-
return ErrorHandleResult.RetryRequired;
45-
}
37+
//Same as recoverability policy in NServiceBusFactory
38+
if (errorContext.ImmediateProcessingFailures < 3)
39+
{
40+
errorMetrics.Retry();
41+
return ErrorHandleResult.RetryRequired;
42+
}
4643

47-
await StoreFailedMessageDocument(errorContext, cancellationToken);
44+
await StoreFailedMessageDocument(errorContext, cancellationToken);
4845

49-
failedCounter.Add(1, tags);
46+
// The failure is counted when errorMetrics is disposed (tagged result = "stored-poison" because Retry() was not called).
5047

51-
return ErrorHandleResult.Handled;
52-
}
48+
return ErrorHandleResult.Handled;
49+
}
5350

54-
async Task StoreFailedMessageDocument(ErrorContext errorContext, CancellationToken cancellationToken)
51+
async Task StoreFailedMessageDocument(ErrorContext errorContext, CancellationToken cancellationToken)
52+
{
53+
var failure = new FailedAuditImport
5554
{
56-
var failure = new FailedAuditImport
57-
{
58-
Id = Guid.NewGuid().ToString(),
59-
Message = new FailedTransportMessage
60-
{
61-
Id = errorContext.Message.MessageId,
62-
Headers = errorContext.Message.Headers,
63-
// At the moment we are taking a defensive copy of the body to avoid issues with the message body
64-
// buffers being returned to the pool and potentially being overwritten. Once we know how RavenDB
65-
// handles byte[] to ReadOnlyMemory<byte> conversion we might be able to remove this.
66-
Body = errorContext.Message.Body.ToArray()
67-
},
68-
ExceptionInfo = errorContext.Exception.ToFriendlyString()
69-
};
70-
71-
try
72-
{
73-
await DoLogging(errorContext.Exception, failure, cancellationToken);
74-
}
75-
finally
55+
Id = Guid.NewGuid().ToString(),
56+
Message = new FailedTransportMessage
7657
{
77-
failureCircuitBreaker.Increment(errorContext.Exception);
78-
}
58+
Id = errorContext.Message.MessageId,
59+
Headers = errorContext.Message.Headers,
60+
// At the moment we are taking a defensive copy of the body to avoid issues with the message body
61+
// buffers being returned to the pool and potentially being overwritten. Once we know how RavenDB
62+
// handles byte[] to ReadOnlyMemory<byte> conversion we might be able to remove this.
63+
Body = errorContext.Message.Body.ToArray()
64+
},
65+
ExceptionInfo = errorContext.Exception.ToFriendlyString()
66+
};
67+
68+
try
69+
{
70+
await DoLogging(errorContext.Exception, failure, cancellationToken);
7971
}
80-
81-
async Task DoLogging(Exception exception, FailedAuditImport failure, CancellationToken cancellationToken)
72+
finally
8273
{
83-
log.Error("Failed importing error message", exception);
74+
failureCircuitBreaker.Increment(errorContext.Exception);
75+
}
76+
}
77+
78+
async Task DoLogging(Exception exception, FailedAuditImport failure, CancellationToken cancellationToken)
79+
{
80+
log.Error("Failed importing error message", exception);
8481

85-
// Write to storage
86-
await failedAuditStorage.SaveFailedAuditImport(failure);
82+
// Write to storage
83+
await failedAuditStorage.SaveFailedAuditImport(failure);
8784

88-
if (!AppEnvironment.RunningInContainer)
85+
if (!AppEnvironment.RunningInContainer)
86+
{
87+
// Write to Log Path
88+
var filePath = Path.Combine(logPath, failure.Id + ".txt");
89+
await File.WriteAllTextAsync(filePath, failure.ExceptionInfo, cancellationToken);
90+
91+
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
8992
{
90-
// Write to Log Path
91-
var filePath = Path.Combine(logPath, failure.Id + ".txt");
92-
await File.WriteAllTextAsync(filePath, failure.ExceptionInfo, cancellationToken);
93-
94-
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
95-
{
96-
WriteToEventLog("A message import has failed. A log file has been written to " + filePath);
97-
}
93+
WriteToEventLog("A message import has failed. A log file has been written to " + filePath);
9894
}
9995
}
96+
}
10097

101-
[SupportedOSPlatform("windows")]
102-
void WriteToEventLog(string message)
103-
{
98+
[SupportedOSPlatform("windows")]
99+
void WriteToEventLog(string message)
100+
{
104101
#if DEBUG
105-
EventSourceCreator.Create();
102+
EventSourceCreator.Create();
106103
#endif
107-
EventLog.WriteEntry(EventSourceCreator.SourceName, message, EventLogEntryType.Error);
108-
}
104+
EventLog.WriteEntry(EventSourceCreator.SourceName, message, EventLogEntryType.Error);
105+
}
109106

110-
readonly Counter<long> retryCounter = Telemetry.Meter.CreateCounter<long>(Telemetry.CreateInstrumentName("ingestion", "retry"), description: "Audit ingestion retries count");
111-
readonly Counter<long> failedCounter = Telemetry.Meter.CreateCounter<long>(Telemetry.CreateInstrumentName("ingestion", "failed"), description: "Audit ingestion failure count");
107+
readonly IFailedAuditStorage failedAuditStorage;
108+
readonly IngestionMetrics metrics;
109+
readonly string logPath;
110+
readonly ImportFailureCircuitBreaker failureCircuitBreaker;
112111

113-
static readonly ILog log = LogManager.GetLogger<AuditIngestionFaultPolicy>();
114-
}
112+
static readonly ILog log = LogManager.GetLogger<AuditIngestionFaultPolicy>();
115113
}

src/ServiceControl.Audit/Auditing/AuditIngestor.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
using System.Linq;
66
using System.Threading.Tasks;
77
using Infrastructure.Settings;
8-
using Metrics;
98
using Monitoring;
109
using NServiceBus;
1110
using NServiceBus.Logging;
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
namespace ServiceControl.Audit.Auditing.Metrics;
2+
3+
using System;
4+
using System.Diagnostics.Metrics;
5+
using NServiceBus.Transport;
6+
7+
/// <summary>
/// Tracks the outcome of handling a single failed message and, when disposed,
/// adds one to <see cref="Failures"/> tagged with that outcome.
/// </summary>
/// <param name="Context">The error context whose message headers are used to build the metric tags.</param>
/// <param name="Failures">The counter incremented once on <see cref="Dispose"/>.</param>
public record ErrorMetrics(ErrorContext Context, Counter<long> Failures) : IDisposable
{
    // Set by Retry(); stays false when the message ends up being stored as a poison message.
    bool retry;

    /// <summary>
    /// Marks this failure as one that will be retried, so it is reported as "retry"
    /// instead of "stored-poison".
    /// </summary>
    public void Retry()
    {
        retry = true;
    }

    /// <summary>
    /// Records the failure: increments <see cref="Failures"/> by one with the message tags
    /// plus a "result" tag of either "retry" or "stored-poison".
    /// </summary>
    public void Dispose()
    {
        var tags = IngestionMetrics.GetMessageTags(Context.Message.Headers);

        string outcome;
        if (retry)
        {
            outcome = "retry";
        }
        else
        {
            outcome = "stored-poison";
        }

        tags.Add("result", outcome);
        Failures.Add(1, tags);
    }
}

src/ServiceControl.Audit/Auditing/Metrics/AuditIngestionMetrics.cs renamed to src/ServiceControl.Audit/Auditing/Metrics/IngestionMetrics.cs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,46 @@
11
namespace ServiceControl.Audit.Auditing.Metrics;
22

3+
using System.Collections.Generic;
4+
using System.Diagnostics;
35
using System.Diagnostics.Metrics;
6+
using EndpointPlugin.Messages.SagaState;
7+
using NServiceBus;
48
using NServiceBus.Transport;
59

6-
public class AuditIngestionMetrics
10+
public class IngestionMetrics
711
{
8-
public AuditIngestionMetrics(IMeterFactory meterFactory)
12+
public IngestionMetrics(IMeterFactory meterFactory)
913
{
1014
var meter = meterFactory.Create(MeterName, MeterVersion);
1115

1216
batchDuration = meter.CreateHistogram<double>(CreateInstrumentName("batch_duration"), unit: "ms", "Average audit message batch processing duration");
13-
consecutiveBatchFailureGauge = meter.CreateObservableGauge(CreateInstrumentName("consecutive_batch_failures"), () => consecutiveBatchFailures, unit: "count", description: "Consecutive audit ingestion batch failure");
17+
consecutiveBatchFailureGauge = meter.CreateObservableGauge(CreateInstrumentName("consecutive_batch_failures"), () => consecutiveBatchFailures, description: "Consecutive audit ingestion batch failure");
1418
ingestionDuration = meter.CreateHistogram<double>(CreateInstrumentName("duration"), unit: "ms", description: "Average incoming audit message processing duration");
19+
failureCounter = meter.CreateCounter<long>(CreateInstrumentName("failure_count"), description: "Audit ingestion failure count");
1520
}
1621

17-
public MessageIngestionMetrics BeginIngestion(MessageContext messageContext) => new(messageContext, ingestionDuration);
22+
public MessageMetrics BeginIngestion(MessageContext messageContext) => new(messageContext, ingestionDuration);
23+
24+
public ErrorMetrics BeginErrorHandling(ErrorContext errorContext) => new(errorContext, failureCounter);
1825

1926
public BatchMetrics BeginBatch(int maxBatchSize) => new(maxBatchSize, batchDuration, RecordBatchOutcome);
2027

28+
/// <summary>
/// Builds the tag list attached to ingestion metrics for a message, based on its headers.
/// </summary>
/// <param name="headers">The headers of the message being measured.</param>
/// <returns>
/// A <see cref="TagList"/> with a single "message.category" tag: "control-message" when the
/// enclosed-message-types header is absent, "saga-update" when its value equals the saga
/// update message type name, and "audit-message" otherwise.
/// </returns>
public static TagList GetMessageTags(Dictionary<string, string> headers)
{
    string category;

    if (!headers.TryGetValue(Headers.EnclosedMessageTypes, out var enclosedTypes))
    {
        // No enclosed message type header present.
        category = "control-message";
    }
    else if (enclosedTypes == SagaUpdateMessageType)
    {
        category = "saga-update";
    }
    else
    {
        category = "audit-message";
    }

    var tags = new TagList();
    tags.Add("message.category", category);
    return tags;
}
43+
2144
void RecordBatchOutcome(bool success)
2245
{
2346
if (success)
@@ -39,7 +62,10 @@ void RecordBatchOutcome(bool success)
3962
readonly ObservableGauge<long> consecutiveBatchFailureGauge;
4063
#pragma warning restore IDE0052
4164
readonly Histogram<double> ingestionDuration;
65+
readonly Counter<long> failureCounter;
4266

4367
const string MeterName = "Particular.ServiceControl.Audit";
4468
const string MeterVersion = "0.1.0";
69+
70+
static readonly string SagaUpdateMessageType = typeof(SagaUpdatedMessage).FullName;
4571
}

src/ServiceControl.Audit/Auditing/Metrics/MessageIngestionMetrics.cs

Lines changed: 0 additions & 45 deletions
This file was deleted.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
namespace ServiceControl.Audit.Auditing.Metrics;
2+
3+
using System;
4+
using System.Diagnostics;
5+
using System.Diagnostics.Metrics;
6+
using NServiceBus.Transport;
7+
8+
/// <summary>
/// Measures how long the ingestion of a single message takes and, when disposed,
/// records that duration (in milliseconds) into <see cref="Duration"/> together with the outcome.
/// </summary>
/// <param name="Context">The message context whose headers are used to build the metric tags.</param>
/// <param name="Duration">The histogram that receives the elapsed time on <see cref="Dispose"/>.</param>
public record MessageMetrics(MessageContext Context, Histogram<double> Duration) : IDisposable
{
    /// <summary>Marks the message as skipped; reported as result "skipped".</summary>
    public void Skipped() => result = "skipped";

    /// <summary>Marks the message as successfully ingested; reported as result "success".</summary>
    public void Success() => result = "success";

    /// <summary>
    /// Records the time elapsed since this instance was created, tagged with the message tags
    /// and a "result" tag ("failed" unless <see cref="Skipped"/> or <see cref="Success"/> was called).
    /// </summary>
    public void Dispose()
    {
        var tags = IngestionMetrics.GetMessageTags(Context.Headers);

        tags.Add("result", result);

        // Use the fractional value: ElapsedMilliseconds is a long and would truncate
        // sub-millisecond durations to whole milliseconds (often 0) in a double histogram.
        Duration.Record(sw.Elapsed.TotalMilliseconds, tags);
    }

    // Outcome defaults to "failed" so that any path that never reports an outcome is counted as a failure.
    string result = "failed";

    // Started when the instance is created, i.e. when ingestion of the message begins.
    readonly Stopwatch sw = Stopwatch.StartNew();
}

0 commit comments

Comments
 (0)