
Commit b8c6fbf

Emit metrics via otel instead of custom format (#4762)
* Emit metrics via otel instead of custom format
* Minimize diff by reverting whitespace changes
* Approvals
* Fix formatting
* Apply suggestions from code review
  Co-authored-by: Ramon Smits <[email protected]>
* fix formatting
* Only use standard otel
* Set instance id
* Set unit
* Better metrics names
* Emit body size
* Fix meter name
* Log that OpenTelemetry metrics exporter is enabled
* Stop using prefixes
* Better name
* Go back to using instance name as service name
* better metrics names
* More cleanup
* Revert app.config
* Add duration and failure counters
* Add consecutive batch failures
* Fix namespace for metrics
* Skip interlocked increment
* Added descriptions for metric instruments
* Metric setup moved to extension method

---------

Co-authored-by: Ramon Smits <[email protected]>
1 parent c9a8215 commit b8c6fbf

15 files changed (+130, -189 lines)

src/Directory.Packages.props

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@
     <PackageVersion Include="NUnit" Version="4.3.2" />
     <PackageVersion Include="NUnit.Analyzers" Version="4.6.0" />
     <PackageVersion Include="NUnit3TestAdapter" Version="5.0.0" />
+    <PackageVersion Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.9.0" />
+    <PackageVersion Include="OpenTelemetry.Extensions.Hosting" Version="1.9.0" />
     <PackageVersion Include="Particular.Approvals" Version="2.0.1" />
     <PackageVersion Include="Particular.Licensing.Sources" Version="6.0.1" />
     <PackageVersion Include="Particular.LicensingComponent.Report" Version="1.0.0" />

src/ServiceControl.Audit.AcceptanceTests/TestSupport/ServiceControlComponentRunner.cs

Lines changed: 0 additions & 1 deletion
@@ -55,7 +55,6 @@ async Task InitializeServiceControl(ScenarioContext context)
             {
                 var id = messageContext.NativeMessageId;
                 var headers = messageContext.Headers;
-
                 var log = NServiceBus.Logging.LogManager.GetLogger<ServiceControlComponentRunner>();
                 headers.TryGetValue(Headers.MessageId, out var originalMessageId);
                 log.Debug($"OnMessage for message '{id}'({originalMessageId ?? string.Empty}).");

src/ServiceControl.Audit.UnitTests/ApprovalFiles/APIApprovals.PlatformSampleSettings.approved.txt

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@
   "ApiUrl": "http://localhost:8888/api",
   "Port": 8888,
   "PrintMetrics": false,
+  "OtlpEndpointUrl": null,
   "Hostname": "localhost",
   "VirtualDirectory": "",
   "TransportType": "LearningTransport",

src/ServiceControl.Audit/App.config

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@ These settings are only here so that we can debug ServiceControl while developin
     <add key="ServiceControl.Audit/ServiceControlQueueAddress" value="Particular.ServiceControl" />
     <add key="ServiceControl.Audit/HostName" value="localhost" />
     <add key="ServiceControl.Audit/DatabaseMaintenancePort" value="44445" />
-
     <!-- DEVS - Pick a transport to run Auditing instance on -->
     <add key="ServiceControl.Audit/TransportType" value="LearningTransport" />
     <!--<add key="ServiceControl.Audit/TransportType" value="AmazonSQS" />-->

src/ServiceControl.Audit/Auditing/AuditIngestion.cs

Lines changed: 31 additions & 23 deletions
@@ -2,7 +2,7 @@
 {
     using System;
     using System.Collections.Generic;
-    using System.Diagnostics;
+    using System.Diagnostics.Metrics;
     using System.Threading;
     using System.Threading.Channels;
     using System.Threading.Tasks;
@@ -14,18 +14,14 @@
     using Persistence;
     using Persistence.UnitOfWork;
     using ServiceControl.Infrastructure;
-    using ServiceControl.Infrastructure.Metrics;
     using Transports;

     class AuditIngestion : IHostedService
     {
-        static readonly long FrequencyInMilliseconds = Stopwatch.Frequency / 1000;
-
         public AuditIngestion(
             Settings settings,
             ITransportCustomization transportCustomization,
             TransportSettings transportSettings,
-            Metrics metrics,
             IFailedAuditStorage failedImportsStorage,
             AuditIngestionCustomCheck.State ingestionState,
             AuditIngestor auditIngestor,
@@ -40,10 +36,6 @@ public AuditIngestion(
            this.settings = settings;
            this.applicationLifetime = applicationLifetime;

-            batchSizeMeter = metrics.GetMeter("Audit ingestion - batch size");
-            batchDurationMeter = metrics.GetMeter("Audit ingestion - batch processing duration", FrequencyInMilliseconds);
-            receivedMeter = metrics.GetCounter("Audit ingestion - received");
-
            if (!transportSettings.MaxConcurrency.HasValue)
            {
                throw new ArgumentException("MaxConcurrency is not set in TransportSettings");
@@ -102,6 +94,7 @@ async Task EnsureStarted(CancellationToken cancellationToken = default)
                    await stoppable.StopReceive(cancellationToken);
                    logger.Info("Shutting down due to failed persistence health check. Infrastructure shut down completed");
                }
+
                return;
            }

@@ -168,6 +161,7 @@ async Task EnsureStopped(CancellationToken cancellationToken = default)
                logger.Info("Shutting down. Already stopped, skipping shut down");
                return; //Already stopped
            }
+
            var stoppable = queueIngestor;
            queueIngestor = null;
            logger.Info("Shutting down. Infrastructure shut down commencing");
@@ -188,18 +182,22 @@ async Task EnsureStopped(CancellationToken cancellationToken = default)

        async Task OnMessage(MessageContext messageContext, CancellationToken cancellationToken)
        {
-            if (settings.MessageFilter != null && settings.MessageFilter(messageContext))
+            using (new DurationRecorder(ingestionDuration))
            {
-                return;
-            }
+                if (settings.MessageFilter != null && settings.MessageFilter(messageContext))
+                {
+                    return;
+                }

-            var taskCompletionSource = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
-            messageContext.SetTaskCompletionSource(taskCompletionSource);
+                var taskCompletionSource = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+                messageContext.SetTaskCompletionSource(taskCompletionSource);

-            receivedMeter.Mark();
+                await channel.Writer.WriteAsync(messageContext, cancellationToken);
+                await taskCompletionSource.Task;

-            await channel.Writer.WriteAsync(messageContext, cancellationToken);
-            await taskCompletionSource.Task;
+                ingestedMessagesCounter.Add(1);
+                messageSize.Record(messageContext.Body.Length / 1024.0);
+            }
        }

        async Task Loop()
@@ -217,11 +215,14 @@ async Task Loop()
                        contexts.Add(context);
                    }

-                    batchSizeMeter.Mark(contexts.Count);
-                    using (batchDurationMeter.Measure())
+                    auditBatchSize.Record(contexts.Count);
+
+                    using (new DurationRecorder(auditBatchDuration))
                    {
                        await auditIngestor.Ingest(contexts);
                    }
+
+                    consecutiveBatchFailuresCounter.Record(0);
                }
                catch (OperationCanceledException)
                {
@@ -240,6 +241,9 @@ async Task Loop()
                    {
                        context.GetTaskCompletionSource().TrySetException(e);
                    }
+
+                    // no need to do interlocked increment since this is running sequential
+                    consecutiveBatchFailuresCounter.Record(consecutiveBatchFailures++);
                }
                finally
                {
@@ -251,8 +255,9 @@ async Task Loop()

        TransportInfrastructure transportInfrastructure;
        IMessageReceiver queueIngestor;
+        long consecutiveBatchFailures = 0;

-        readonly SemaphoreSlim startStopSemaphore = new SemaphoreSlim(1);
+        readonly SemaphoreSlim startStopSemaphore = new(1);
        readonly string inputEndpoint;
        readonly ITransportCustomization transportCustomization;
        readonly TransportSettings transportSettings;
@@ -261,9 +266,12 @@ async Task Loop()
        readonly IAuditIngestionUnitOfWorkFactory unitOfWorkFactory;
        readonly Settings settings;
        readonly Channel<MessageContext> channel;
-        readonly Meter batchSizeMeter;
-        readonly Meter batchDurationMeter;
-        readonly Counter receivedMeter;
+        readonly Histogram<long> auditBatchSize = Telemetry.Meter.CreateHistogram<long>(Telemetry.CreateInstrumentName("ingestion", "batch_size"), description: "Audit ingestion average batch size");
+        readonly Histogram<double> auditBatchDuration = Telemetry.Meter.CreateHistogram<double>(Telemetry.CreateInstrumentName("ingestion", "batch_duration"), unit: "ms", description: "Average audit message batch processing duration");
+        readonly Histogram<double> messageSize = Telemetry.Meter.CreateHistogram<double>(Telemetry.CreateInstrumentName("ingestion", "message_size"), unit: "kilobytes", description: "Average audit message body size");
+        readonly Counter<long> ingestedMessagesCounter = Telemetry.Meter.CreateCounter<long>(Telemetry.CreateInstrumentName("ingestion", "count"), description: "Successful ingested audit message count");
+        readonly Histogram<long> consecutiveBatchFailuresCounter = Telemetry.Meter.CreateHistogram<long>(Telemetry.CreateInstrumentName("ingestion", "consecutive_batch_failures"), unit: "count", description: "Consecutive audit ingestion batch failure");
+        readonly Histogram<double> ingestionDuration = Telemetry.Meter.CreateHistogram<double>(Telemetry.CreateInstrumentName("ingestion", "duration"), unit: "ms", description: "Average incoming audit message processing duration");
        readonly Watchdog watchdog;
        readonly Task ingestionWorker;
        readonly IHostApplicationLifetime applicationLifetime;
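OnMessage and Loop above wrap work in `using (new DurationRecorder(...))`, but DurationRecorder itself is not part of this diff. A plausible implementation, assuming it records the elapsed time into the histogram when disposed, in milliseconds to match the "ms" unit declared on the instruments:

    using System;
    using System.Diagnostics;
    using System.Diagnostics.Metrics;

    // Records the wall-clock duration of a scope into a histogram when disposed.
    sealed class DurationRecorder : IDisposable
    {
        readonly Histogram<double> histogram;
        readonly long start = Stopwatch.GetTimestamp();

        public DurationRecorder(Histogram<double> histogram) => this.histogram = histogram;

        // Stopwatch.GetElapsedTime requires .NET 7+; older targets would compute
        // (GetTimestamp() - start) * 1000.0 / Stopwatch.Frequency instead.
        public void Dispose() => histogram.Record(Stopwatch.GetElapsedTime(start).TotalMilliseconds);
    }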

src/ServiceControl.Audit/Auditing/AuditIngestionFaultPolicy.cs

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,7 @@
 {
     using System;
     using System.Diagnostics;
+    using System.Diagnostics.Metrics;
     using System.IO;
     using System.Runtime.InteropServices;
     using System.Runtime.Versioning;
@@ -37,10 +38,13 @@ public async Task<ErrorHandleResult> OnError(ErrorContext errorContext, Cancella
            //Same as recoverability policy in NServiceBusFactory
            if (errorContext.ImmediateProcessingFailures < 3)
            {
+                retryCounter.Add(1);
                return ErrorHandleResult.RetryRequired;
            }

            await StoreFailedMessageDocument(errorContext, cancellationToken);
+
+            failedCounter.Add(1);
            return ErrorHandleResult.Handled;
        }

@@ -100,6 +104,9 @@ void WriteToEventLog(string message)
            EventLog.WriteEntry(EventSourceCreator.SourceName, message, EventLogEntryType.Error);
        }

+        readonly Counter<long> retryCounter = Telemetry.Meter.CreateCounter<long>(Telemetry.CreateInstrumentName("ingestion", "retry"), description: "Audit ingestion retries count");
+        readonly Counter<long> failedCounter = Telemetry.Meter.CreateCounter<long>(Telemetry.CreateInstrumentName("ingestion", "failed"), description: "Audit ingestion failure count");
+
        static readonly ILog log = LogManager.GetLogger<AuditIngestionFaultPolicy>();
    }
 }
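Both files create their instruments through a shared Telemetry class that this commit does not show. A minimal sketch consistent with the calls above; the meter name and the exact naming scheme produced by CreateInstrumentName are assumptions:

    using System.Diagnostics.Metrics;

    static class Telemetry
    {
        // one process-wide Meter for all ServiceControl.Audit instruments
        public static readonly Meter Meter = new("Particular.ServiceControl.Audit");

        // e.g. ("ingestion", "batch_size") => "sc.audit.ingestion.batch_size" (scheme assumed)
        public static string CreateInstrumentName(string category, string name) =>
            $"sc.audit.{category}.{name}";
    }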

src/ServiceControl.Audit/Auditing/AuditIngestor.cs

Lines changed: 11 additions & 38 deletions
@@ -2,7 +2,7 @@
 {
     using System;
     using System.Collections.Generic;
-    using System.Diagnostics;
+    using System.Diagnostics.Metrics;
     using System.Linq;
     using System.Threading.Tasks;
     using Infrastructure.Settings;
@@ -14,13 +14,11 @@
     using Persistence.UnitOfWork;
     using Recoverability;
     using SagaAudit;
-    using ServiceControl.Infrastructure.Metrics;
     using ServiceControl.Transports;

     public class AuditIngestor
     {
         public AuditIngestor(
-            Metrics metrics,
             Settings settings,
             IAuditIngestionUnitOfWorkFactory unitOfWorkFactory,
             EndpointInstanceMonitoring endpointInstanceMonitoring,
@@ -32,50 +30,28 @@ ITransportCustomization transportCustomization
        {
            this.settings = settings;
            this.messageDispatcher = messageDispatcher;
-
-            var ingestedAuditMeter = metrics.GetCounter("Audit ingestion - ingested audit");
-            var ingestedSagaAuditMeter = metrics.GetCounter("Audit ingestion - ingested saga audit");
-            var auditBulkInsertDurationMeter = metrics.GetMeter("Audit ingestion - audit bulk insert duration", FrequencyInMilliseconds);
-            var sagaAuditBulkInsertDurationMeter = metrics.GetMeter("Audit ingestion - saga audit bulk insert duration", FrequencyInMilliseconds);
-            var bulkInsertCommitDurationMeter = metrics.GetMeter("Audit ingestion - bulk insert commit duration", FrequencyInMilliseconds);
-
-            var enrichers = new IEnrichImportedAuditMessages[]
-            {
-                new MessageTypeEnricher(),
-                new EnrichWithTrackingIds(),
-                new ProcessingStatisticsEnricher(),
-                new DetectNewEndpointsFromAuditImportsEnricher(endpointInstanceMonitoring),
-                new DetectSuccessfulRetriesEnricher(),
-                new SagaRelationshipsEnricher()
-            }.Concat(auditEnrichers).ToArray();
+            var enrichers = new IEnrichImportedAuditMessages[] { new MessageTypeEnricher(), new EnrichWithTrackingIds(), new ProcessingStatisticsEnricher(), new DetectNewEndpointsFromAuditImportsEnricher(endpointInstanceMonitoring), new DetectSuccessfulRetriesEnricher(), new SagaRelationshipsEnricher() }.Concat(auditEnrichers).ToArray();

            logQueueAddress = transportCustomization.ToTransportQualifiedQueueName(settings.AuditLogQueue);

-            auditPersister = new AuditPersister(unitOfWorkFactory, enrichers, ingestedAuditMeter, ingestedSagaAuditMeter, auditBulkInsertDurationMeter, sagaAuditBulkInsertDurationMeter, bulkInsertCommitDurationMeter, messageSession, messageDispatcher);
+            auditPersister = new AuditPersister(
+                unitOfWorkFactory,
+                enrichers,
+                messageSession,
+                messageDispatcher
+            );
        }

        public async Task Ingest(List<MessageContext> contexts)
        {
-            if (Log.IsDebugEnabled)
-            {
-                Log.Debug($"Ingesting {contexts.Count} message contexts");
-            }
-
            var stored = await auditPersister.Persist(contexts);

            try
            {
                if (settings.ForwardAuditMessages)
                {
-                    if (Log.IsDebugEnabled)
-                    {
-                        Log.Debug($"Forwarding {stored.Count} messages");
-                    }
                    await Forward(stored, logQueueAddress);
-                    if (Log.IsDebugEnabled)
-                    {
-                        Log.Debug("Forwarded messages");
-                    }
+                    forwardedMessagesCounter.Add(stored.Count);
                }

                foreach (var context in contexts)
@@ -85,10 +61,7 @@ public async Task Ingest(List<MessageContext> contexts)
            }
            catch (Exception e)
            {
-                if (Log.IsWarnEnabled)
-                {
-                    Log.Warn("Forwarding messages failed", e);
-                }
+                Log.Warn("Forwarding messages failed", e);

                // making sure to rethrow so that all messages get marked as failed
                throw;
@@ -158,8 +131,8 @@ public async Task VerifyCanReachForwardingAddress()
        readonly Settings settings;
        readonly Lazy<IMessageDispatcher> messageDispatcher;
        readonly string logQueueAddress;
+        readonly Counter<long> forwardedMessagesCounter = Telemetry.Meter.CreateCounter<long>(Telemetry.CreateInstrumentName("ingestion", "forwarded_count"), description: "Audit ingestion forwarded message count");

-        static readonly long FrequencyInMilliseconds = Stopwatch.Frequency / 1000;
        static readonly ILog Log = LogManager.GetLogger<AuditIngestor>();
    }
 }
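Because these instruments are standard System.Diagnostics.Metrics, they can also be observed in-process without the OTLP pipeline, which is handy in tests. A sketch using MeterListener; the meter name must match whatever Telemetry actually uses (assumed here, per the earlier sketch):

    using System;
    using System.Diagnostics.Metrics;

    using var listener = new MeterListener();
    listener.InstrumentPublished = (instrument, l) =>
    {
        if (instrument.Meter.Name == "Particular.ServiceControl.Audit") // assumed meter name
        {
            l.EnableMeasurementEvents(instrument);
        }
    };
    // Print every long-valued measurement, e.g. forwarded_count increments.
    listener.SetMeasurementEventCallback<long>((instrument, value, tags, state) =>
        Console.WriteLine($"{instrument.Name}: {value}"));
    listener.Start();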
