Skip to content

Commit 86d1612

Browse files
Add the CheckDirtyMemory custom check to the primary instance
1 parent 0d5325f commit 86d1612

File tree

5 files changed

+144
-0
lines changed

5 files changed

+144
-0
lines changed

src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ public void AddPersistence(IServiceCollection services)
4949
services.AddCustomCheck<CheckRavenDBIndexLag>();
5050
services.AddCustomCheck<CheckFreeDiskSpace>();
5151
services.AddCustomCheck<CheckMinimumStorageRequiredForIngestion>();
52+
services.AddCustomCheck<CheckDirtyMemory>();
5253

54+
services.AddSingleton<MemoryInformationRetriever>();
5355
services.AddSingleton<OperationsManager>();
5456

5557
services.AddSingleton<IArchiveMessages, MessageArchiver>();
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
namespace ServiceControl.Persistence.RavenDB.CustomChecks;
2+
3+
using System;
4+
using System.Collections.Generic;
5+
using System.Threading;
6+
using System.Threading.Tasks;
7+
using NServiceBus.CustomChecks;
8+
using NServiceBus.Logging;
9+
10+
/// <summary>
/// Custom check reporting on the dirty memory figures returned by <see cref="MemoryInformationRetriever"/>.
/// It fails when the retriever flags dirty memory as high, or when the stored readings show an
/// increasing trend; otherwise it passes.
/// </summary>
class CheckDirtyMemory(MemoryInformationRetriever memoryInformationRetriever) : CustomCheck("RavenDB dirty memory trends", "ServiceControl Health", TimeSpan.FromMinutes(5))
{
    // Cap the history at 20 readings, which means we're keeping about 1 hour and 40 minutes of data.
    const int MaxStoredReads = 20;

    // Three means we'll be observing for 15 minutes before calculating the trend.
    const int MinReadsForTrend = 3;

    static readonly ILog Log = LogManager.GetLogger<CheckDirtyMemory>();

    // Dirty memory readings (in kb) from previous runs, oldest first.
    readonly List<int> lastDirtyMemoryReads = [];

    /// <summary>
    /// Fetches the current dirty memory information, records the reading, and evaluates it.
    /// </summary>
    /// <param name="cancellationToken">Token passed on to the memory information request.</param>
    /// <returns>
    /// A failed result when dirty memory is flagged as high or is trending upwards,
    /// <see cref="CheckResult.Pass"/> otherwise.
    /// </returns>
    public override async Task<CheckResult> PerformCheck(CancellationToken cancellationToken = default)
    {
        var memoryInfo = await memoryInformationRetriever.GetMemoryInformation(cancellationToken);
        var dirtyMemoryKb = memoryInfo.DirtyMemoryKb;

        if (memoryInfo.IsHighDirty)
        {
            // A reading flagged as high is reported immediately and is not added to the history.
            var highDirtyMessage = $"There is a high level of RavenDB dirty memory ({dirtyMemoryKb}kb). Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue.";
            Log.Warn(highDirtyMessage);
            return CheckResult.Failed(highDirtyMessage);
        }

        lastDirtyMemoryReads.Add(dirtyMemoryKb);
        if (lastDirtyMemoryReads.Count > MaxStoredReads)
        {
            // Drop the oldest reading.
            lastDirtyMemoryReads.RemoveAt(0);
        }

        if (lastDirtyMemoryReads.Count < MinReadsForTrend)
        {
            Log.Debug("Not enough RavenDB dirty memory data in the series to calculate a trend.");
            return CheckResult.Pass;
        }

        // TODO do we need a threshold below which the check never fails?
        if (AnalyzeTrendUsingRegression(lastDirtyMemoryReads) != TrendDirection.Increasing)
        {
            return CheckResult.Pass;
        }

        var increasingMessage = $"RavenDB dirty memory is increasing. Last available value is {dirtyMemoryKb}kb. Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue.";
        Log.Warn(increasingMessage);
        return CheckResult.Failed(increasingMessage);
    }

    /// <summary>
    /// Classifies a series of readings by the sign of the least-squares regression slope,
    /// using each reading's position in the list as the x coordinate.
    /// </summary>
    /// <param name="values">The readings, in the order they were taken.</param>
    /// <returns>The direction of the fitted line.</returns>
    /// <exception cref="ArgumentException">Thrown when fewer than two values are supplied.</exception>
    static TrendDirection AnalyzeTrendUsingRegression(List<int> values)
    {
        if (values is null || values.Count <= 1)
        {
            throw new ArgumentException("Need at least two values to determine a trend");
        }

        double n = values.Count;
        double sumX = 0;
        double sumY = 0;
        double sumXY = 0;
        double sumXX = 0;

        var position = 0;
        foreach (var reading in values)
        {
            double x = position++;
            double y = reading;

            sumX += x;
            sumY += y;
            sumXY += x * y;
            sumXX += x * x;
        }

        // Slope formula: (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
        var slope = ((n * sumXY) - (sumX * sumY)) / ((n * sumXX) - (sumX * sumX));

        // Small threshold to handle floating-point precision
        const double slopeThreshold = 0.001;
        if (Math.Abs(slope) < slopeThreshold)
        {
            return TrendDirection.Flat;
        }

        if (slope > 0)
        {
            return TrendDirection.Increasing;
        }

        return TrendDirection.Decreasing;
    }

    /// <summary>Direction of the regression line fitted through the readings.</summary>
    enum TrendDirection
    {
        Increasing,
        Decreasing,
        Flat
    }
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
namespace ServiceControl.Persistence.RavenDB;
2+
3+
using System;
4+
using System.Net.Http;
5+
using System.Text.Json;
6+
using System.Threading;
7+
using System.Threading.Tasks;
8+
9+
/// <summary>
/// Requests memory statistics over HTTP from the address given by the persister connection string
/// and extracts the dirty memory figures from the response.
/// </summary>
class MemoryInformationRetriever(RavenPersisterSettings persisterSettings)
{
    // TODO what does a connection string look like? Is it only a URI or could it contain other stuff?
    readonly HttpClient client = new() { BaseAddress = new Uri(persisterSettings.ConnectionString) };

    // Shape of the parts of the memory stats JSON payload that this class reads.
    record ResponseDto
    {
        public MemoryInformation MemoryInformation { get; set; }
    }

    record MemoryInformation
    {
        public bool IsHighDirty { get; set; }

        // Expected in the form "<integer> KBytes".
        public string DirtyMemory { get; set; }
    }

    /// <summary>
    /// Fetches the memory statistics and returns the high-dirty flag together with the
    /// dirty memory amount.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the HTTP request and the body read.</param>
    /// <returns>The high-dirty flag and the dirty memory amount in KBytes.</returns>
    /// <exception cref="HttpRequestException">Thrown when the server answers with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">
    /// Thrown when the response lacks the memory information, or the dirty memory value is not
    /// an integer amount expressed in KBytes.
    /// </exception>
    public async Task<(bool IsHighDirty, int DirtyMemoryKb)> GetMemoryInformation(CancellationToken cancellationToken = default)
    {
        // Dispose the response so the underlying connection resources are released promptly.
        using var httpResponse = await client.GetAsync("/admin/debug/memory/stats?includeThreads=false&includeMappings=false", cancellationToken);

        // Fail with the HTTP status rather than trying to deserialize an error body.
        httpResponse.EnsureSuccessStatusCode();

        var payload = await httpResponse.Content.ReadAsStringAsync(cancellationToken);
        var memoryInformation = JsonSerializer.Deserialize<ResponseDto>(payload)?.MemoryInformation;
        if (memoryInformation is null)
        {
            throw new InvalidOperationException("Unexpected response. The memory stats response did not contain the MemoryInformation section.");
        }

        var dirtyMemory = memoryInformation.DirtyMemory;
        var values = dirtyMemory?.Split(' ');

        // Guard the shape before indexing: a missing value or a value without a unit is reported
        // with the same descriptive error as an unexpected unit.
        if (values is null || values.Length < 2 || !string.Equals(values[1], "KBytes", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException($"Unexpected response. Was expecting memory details in KBytes, instead received: {dirtyMemory}");
        }

        // The value is machine-generated, so parse it independently of the current culture.
        if (!int.TryParse(values[0], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var dirtyMemoryKb))
        {
            throw new InvalidOperationException($"Unexpected response. Was expecting an integer amount of KBytes, instead received: {dirtyMemory}");
        }

        return (memoryInformation.IsHighDirty, dirtyMemoryKb);
    }
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
ServiceControl Health: Error Database Index Errors
22
ServiceControl Health: Error Database Index Lag
33
ServiceControl Health: Message Ingestion Process
4+
ServiceControl Health: RavenDB dirty memory trends
45
Storage space: ServiceControl database
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
ServiceControl Health: Error Database Index Errors
22
ServiceControl Health: Error Database Index Lag
33
ServiceControl Health: Message Ingestion Process
4+
ServiceControl Health: RavenDB dirty memory trends
45
Storage space: ServiceControl database

0 commit comments

Comments
 (0)