Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
- name: Setup .NET SDK
uses: actions/[email protected]
with:
dotnet-version: 8.0.x
dotnet-version: 8.0.406
- name: Download RavenDB Server
run: ./tools/download-ravendb-server.ps1
- name: Build
Expand Down
3 changes: 1 addition & 2 deletions global.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"sdk": {
"version": "8.0.400",
"rollForward": "latestFeature"
"version": "8.0.406"
},
"msbuild-sdks": {
"Microsoft.Build.NoTargets": "3.7.56"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
namespace ServiceControl.Audit.Persistence.RavenDB.CustomChecks;

using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using NServiceBus.CustomChecks;
using NServiceBus.Logging;

/// <summary>
/// Custom check that fails when RavenDB flags its dirty memory as high, or when the
/// dirty memory readings collected by previous runs of this check show an increasing trend.
/// </summary>
class CheckDirtyMemory(MemoryInformationRetriever memoryInformationRetriever) : CustomCheck("RavenDB dirty memory trends", "ServiceControl.Audit Health", TimeSpan.FromMinutes(5))
{
    // Cap the series at 20 readings which means we're keeping about 1 hour and 40 minutes of data
    const int MaxRetainedReads = 20;

    // TODO do we need a threshold below which the check never fails?
    // Three means we'll be observing for 15 minutes before calculating the trend
    const int MinReadsForTrend = 3;

    static readonly ILog Log = LogManager.GetLogger<CheckDirtyMemory>();

    // Dirty memory readings in kilobytes, oldest first
    readonly List<int> lastDirtyMemoryReads = [];

    /// <summary>
    /// Retrieves the current dirty memory information and evaluates it.
    /// </summary>
    /// <param name="cancellationToken">Token forwarded to the memory information retrieval.</param>
    /// <returns>
    /// A failed result when dirty memory is flagged as high or the recorded series is trending upwards;
    /// otherwise <see cref="CheckResult.Pass"/>.
    /// </returns>
    public override async Task<CheckResult> PerformCheck(CancellationToken cancellationToken = default)
    {
        var (highDirty, currentKb) = await memoryInformationRetriever.GetMemoryInformation(cancellationToken);

        // A high-dirty reading fails immediately and is not added to the trend series
        if (highDirty)
        {
            return WarnAndFail(
                $"There is a high level of RavenDB dirty memory ({currentKb}kb). Check the ServiceControl " +
                "troubleshooting guide for guidance on how to mitigate the issue. " +
                "Visit the https://docs.particular.net/servicecontrol/troubleshooting page for more information.");
        }

        lastDirtyMemoryReads.Add(currentKb);
        if (lastDirtyMemoryReads.Count > MaxRetainedReads)
        {
            // Drop the oldest reading so the series never grows beyond the cap
            lastDirtyMemoryReads.RemoveAt(0);
        }

        if (lastDirtyMemoryReads.Count < MinReadsForTrend)
        {
            Log.Debug("Not enough RavenDB dirty memory data in the series to calculate a trend.");
            return CheckResult.Pass;
        }

        if (AnalyzeTrendUsingRegression(lastDirtyMemoryReads) != TrendDirection.Increasing)
        {
            return CheckResult.Pass;
        }

        return WarnAndFail(
            $"RavenDB dirty memory is increasing. Last available value is {currentKb}kb. " +
            "Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue. " +
            "Visit the https://docs.particular.net/servicecontrol/troubleshooting page for more information.");
    }

    // Logs the message as a warning and turns it into a failed check result
    static CheckResult WarnAndFail(string message)
    {
        Log.Warn(message);
        return CheckResult.Failed(message);
    }

    /// <summary>
    /// Fits a least-squares line through the readings, using each reading's position in the list
    /// as the x coordinate, and classifies the slope of that line.
    /// </summary>
    /// <param name="values">The readings to analyze, in chronological order.</param>
    /// <returns>The direction of the fitted line.</returns>
    /// <exception cref="ArgumentException">Thrown when fewer than two values are supplied.</exception>
    static TrendDirection AnalyzeTrendUsingRegression(List<int> values)
    {
        if (values is null || values.Count < 2)
        {
            throw new ArgumentException("Need at least two values to determine a trend");
        }

        double n = values.Count;
        double sumX = 0;
        double sumY = 0;
        double sumXY = 0;
        double sumXX = 0;

        var position = 0;
        foreach (var reading in values)
        {
            double x = position++;
            double y = reading;

            sumX += x;
            sumY += y;
            sumXY += x * y;
            sumXX += x * x;
        }

        // Slope formula: (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
        var slope = ((n * sumXY) - (sumX * sumY)) / ((n * sumXX) - (sumX * sumX));

        // Small tolerance so that a practically horizontal line is reported as flat
        const double flatTolerance = 0.001;
        if (Math.Abs(slope) < flatTolerance)
        {
            return TrendDirection.Flat;
        }

        if (slope > 0)
        {
            return TrendDirection.Increasing;
        }

        return TrendDirection.Decreasing;
    }

    enum TrendDirection
    {
        Increasing,
        Decreasing,
        Flat
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
namespace ServiceControl.Audit.Persistence.RavenDB;

using System;
using System.Net.Http;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;

/// <summary>
/// Retrieves dirty memory information by calling the <c>/admin/debug/memory/stats</c> endpoint
/// of the configured RavenDB server.
/// </summary>
class MemoryInformationRetriever(DatabaseConfiguration databaseConfiguration)
{
    // TODO what does a connection string look like? Is it only a URI or could it contain other stuff?
    // The ?? operator is needed because ServerUrl is populated when running embedded and connection string when running in external mode.
    // However the tricky part is that when tests are run they behave like if it was external mode
    readonly HttpClient client = new() { BaseAddress = new Uri(databaseConfiguration.ServerConfiguration.ServerUrl ?? databaseConfiguration.ServerConfiguration.ConnectionString) };

    record ResponseDto
    {
        public MemoryInformation MemoryInformation { get; set; }
    }

    record MemoryInformation
    {
        public bool IsHighDirty { get; set; }
        public string DirtyMemory { get; set; }
    }

    /// <summary>
    /// Queries the server memory stats and extracts the dirty memory details.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the HTTP request and the reading of its content.</param>
    /// <returns>The high-dirty flag reported by the server and the dirty memory amount expressed in kilobytes.</returns>
    /// <exception cref="HttpRequestException">Thrown when the server answers with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">Thrown when the response does not contain parsable dirty memory details.</exception>
    public async Task<(bool IsHighDirty, int DirtyMemoryKb)> GetMemoryInformation(CancellationToken cancellationToken = default)
    {
        using var httpResponse = await client.GetAsync("/admin/debug/memory/stats?includeThreads=false&includeMappings=false", cancellationToken);

        // Fail with a meaningful HTTP error instead of trying to deserialize an error payload
        httpResponse.EnsureSuccessStatusCode();

        var responseBody = await httpResponse.Content.ReadAsStringAsync(cancellationToken);
        var memoryInformation = JsonSerializer.Deserialize<ResponseDto>(responseBody)?.MemoryInformation
            ?? throw new InvalidOperationException("Unexpected response. The memory stats response did not contain any memory information.");

        return (memoryInformation.IsHighDirty, ParseDirtyMemoryKb(memoryInformation.DirtyMemory));
    }

    // Converts a "<amount> <unit>" text into whole kilobytes.
    // NOTE(review): presumably RavenDB reports a humanized size whose unit grows with the amount
    // (e.g. "1.5 MBytes") and uses binary multiples (1 KByte = 1024 Bytes) - confirm against the RavenDB version in use.
    static int ParseDirtyMemoryKb(string dirtyMemory)
    {
        if (!string.IsNullOrWhiteSpace(dirtyMemory))
        {
            var parts = dirtyMemory.Split(' ', StringSplitOptions.RemoveEmptyEntries);

            // The payload is machine generated, so the amount is parsed with the invariant culture
            if (parts.Length >= 2
                && double.TryParse(parts[0], System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands, System.Globalization.CultureInfo.InvariantCulture, out var amount)
                && TryGetKilobytesPerUnit(parts[1], out var kilobytesPerUnit))
            {
                var kilobytes = Math.Round(amount * kilobytesPerUnit);

                // Also rejects NaN and infinity, for which both comparisons are false
                if (kilobytes >= int.MinValue && kilobytes <= int.MaxValue)
                {
                    return (int)kilobytes;
                }
            }
        }

        throw new InvalidOperationException($"Unexpected response. Was expecting memory details as an amount followed by Bytes, KBytes, MBytes, GBytes or TBytes, instead received: {dirtyMemory}");
    }

    // Maps a unit name to the number of kilobytes one such unit represents; returns false for unknown units
    static bool TryGetKilobytesPerUnit(string unit, out double kilobytesPerUnit)
    {
        if (Is("Bytes"))
        {
            kilobytesPerUnit = 1d / 1024;
        }
        else if (Is("KBytes"))
        {
            kilobytesPerUnit = 1;
        }
        else if (Is("MBytes"))
        {
            kilobytesPerUnit = 1024;
        }
        else if (Is("GBytes"))
        {
            kilobytesPerUnit = 1024d * 1024;
        }
        else if (Is("TBytes"))
        {
            kilobytesPerUnit = 1024d * 1024 * 1024;
        }
        else
        {
            kilobytesPerUnit = 0;
            return false;
        }

        return true;

        bool Is(string expected) => string.Equals(unit, expected, StringComparison.OrdinalIgnoreCase);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public void AddPersistence(IServiceCollection services)
static void ConfigureLifecycle(IServiceCollection services, DatabaseConfiguration databaseConfiguration)
{
services.AddSingleton(databaseConfiguration);
services.AddSingleton<MemoryInformationRetriever>();

services.AddSingleton<IRavenSessionProvider, RavenSessionProvider>();
services.AddHostedService<RavenPersistenceLifecycleHostedService>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ServiceControl.Audit Health: Audit Database Index Lag
ServiceControl.Audit Health: Audit Message Ingestion Process
ServiceControl.Audit Health: RavenDB dirty memory trends
Storage space: ServiceControl.Audit database
2 changes: 2 additions & 0 deletions src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@
services.AddCustomCheck<CheckRavenDBIndexLag>();
services.AddCustomCheck<CheckFreeDiskSpace>();
services.AddCustomCheck<CheckMinimumStorageRequiredForIngestion>();
services.AddCustomCheck<CheckDirtyMemory>();

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-Default

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-Default

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SqlServer

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SqlServer

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureServiceBus

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureServiceBus

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-RabbitMQ

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-RabbitMQ

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureStorageQueues

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureStorageQueues

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SQS

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SQS

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenAcceptance

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenAcceptance

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenPersistence

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenPersistence

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PostgreSQL

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 52 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PostgreSQL

The type or namespace name 'CheckDirtyMemory' could not be found (are you missing a using directive or an assembly reference?)

services.AddSingleton<MemoryInformationRetriever>();

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-Default

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-Default

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SqlServer

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SqlServer

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureServiceBus

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureServiceBus

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-RabbitMQ

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-RabbitMQ

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureStorageQueues

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-AzureStorageQueues

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SQS

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-SQS

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenAcceptance

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenAcceptance

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenPersistence

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PrimaryRavenPersistence

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PostgreSQL

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 54 in src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs

View workflow job for this annotation

GitHub Actions / Linux-PostgreSQL

The type or namespace name 'MemoryInformationRetriever' could not be found (are you missing a using directive or an assembly reference?)
services.AddSingleton<OperationsManager>();

services.AddSingleton<IArchiveMessages, MessageArchiver>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
namespace ServiceControl.Persistence.RavenDB.CustomChecks;

using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using NServiceBus.CustomChecks;
using NServiceBus.Logging;

/// <summary>
/// Custom check that fails when RavenDB flags its dirty memory as high, or when the
/// dirty memory readings collected by previous runs of this check show an increasing trend.
/// </summary>
class CheckDirtyMemory(MemoryInformationRetriever memoryInformationRetriever) : CustomCheck("RavenDB dirty memory trends", "ServiceControl Health", TimeSpan.FromMinutes(5))
{
    // Cap the series at 20 readings which means we're keeping about 1 hour and 40 minutes of data
    const int MaxRetainedReads = 20;

    // TODO do we need a threshold below which the check never fails?
    // Three means we'll be observing for 15 minutes before calculating the trend
    const int MinReadsForTrend = 3;

    static readonly ILog Log = LogManager.GetLogger<CheckDirtyMemory>();

    // Dirty memory readings in kilobytes, oldest first
    readonly List<int> lastDirtyMemoryReads = [];

    /// <summary>
    /// Retrieves the current dirty memory information and evaluates it.
    /// </summary>
    /// <param name="cancellationToken">Token forwarded to the memory information retrieval.</param>
    /// <returns>
    /// A failed result when dirty memory is flagged as high or the recorded series is trending upwards;
    /// otherwise <see cref="CheckResult.Pass"/>.
    /// </returns>
    public override async Task<CheckResult> PerformCheck(CancellationToken cancellationToken = default)
    {
        var (highDirty, currentKb) = await memoryInformationRetriever.GetMemoryInformation(cancellationToken);

        // A high-dirty reading fails immediately and is not added to the trend series
        if (highDirty)
        {
            return WarnAndFail(
                $"There is a high level of RavenDB dirty memory ({currentKb}kb). Check the ServiceControl " +
                "troubleshooting guide for guidance on how to mitigate the issue. " +
                "Visit the https://docs.particular.net/servicecontrol/troubleshooting page for more information.");
        }

        lastDirtyMemoryReads.Add(currentKb);
        if (lastDirtyMemoryReads.Count > MaxRetainedReads)
        {
            // Drop the oldest reading so the series never grows beyond the cap
            lastDirtyMemoryReads.RemoveAt(0);
        }

        if (lastDirtyMemoryReads.Count < MinReadsForTrend)
        {
            Log.Debug("Not enough RavenDB dirty memory data in the series to calculate a trend.");
            return CheckResult.Pass;
        }

        if (AnalyzeTrendUsingRegression(lastDirtyMemoryReads) != TrendDirection.Increasing)
        {
            return CheckResult.Pass;
        }

        return WarnAndFail(
            $"RavenDB dirty memory is increasing. Last available value is {currentKb}kb. " +
            "Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue. " +
            "Visit the https://docs.particular.net/servicecontrol/troubleshooting page for more information.");
    }

    // Logs the message as a warning and turns it into a failed check result
    static CheckResult WarnAndFail(string message)
    {
        Log.Warn(message);
        return CheckResult.Failed(message);
    }

    /// <summary>
    /// Fits a least-squares line through the readings, using each reading's position in the list
    /// as the x coordinate, and classifies the slope of that line.
    /// </summary>
    /// <param name="values">The readings to analyze, in chronological order.</param>
    /// <returns>The direction of the fitted line.</returns>
    /// <exception cref="ArgumentException">Thrown when fewer than two values are supplied.</exception>
    static TrendDirection AnalyzeTrendUsingRegression(List<int> values)
    {
        if (values is null || values.Count < 2)
        {
            throw new ArgumentException("Need at least two values to determine a trend");
        }

        double n = values.Count;
        double sumX = 0;
        double sumY = 0;
        double sumXY = 0;
        double sumXX = 0;

        var position = 0;
        foreach (var reading in values)
        {
            double x = position++;
            double y = reading;

            sumX += x;
            sumY += y;
            sumXY += x * y;
            sumXX += x * x;
        }

        // Slope formula: (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
        var slope = ((n * sumXY) - (sumX * sumY)) / ((n * sumXX) - (sumX * sumX));

        // Small tolerance so that a practically horizontal line is reported as flat
        const double flatTolerance = 0.001;
        if (Math.Abs(slope) < flatTolerance)
        {
            return TrendDirection.Flat;
        }

        if (slope > 0)
        {
            return TrendDirection.Increasing;
        }

        return TrendDirection.Decreasing;
    }

    enum TrendDirection
    {
        Increasing,
        Decreasing,
        Flat
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
namespace ServiceControl.Persistence.RavenDB;

using System;
using System.Net.Http;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;

/// <summary>
/// Retrieves dirty memory information by calling the <c>/admin/debug/memory/stats</c> endpoint
/// of the configured RavenDB server.
/// </summary>
class MemoryInformationRetriever(RavenPersisterSettings persisterSettings)
{
    // TODO what does a connection string look like? Is it only a URI or could it contain other stuff?
    readonly HttpClient client = new() { BaseAddress = new Uri(persisterSettings.ConnectionString) };

    record ResponseDto
    {
        public MemoryInformation MemoryInformation { get; set; }
    }

    record MemoryInformation
    {
        public bool IsHighDirty { get; set; }
        public string DirtyMemory { get; set; }
    }

    /// <summary>
    /// Queries the server memory stats and extracts the dirty memory details.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the HTTP request and the reading of its content.</param>
    /// <returns>The high-dirty flag reported by the server and the dirty memory amount expressed in kilobytes.</returns>
    /// <exception cref="HttpRequestException">Thrown when the server answers with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">Thrown when the response does not contain parsable dirty memory details.</exception>
    public async Task<(bool IsHighDirty, int DirtyMemoryKb)> GetMemoryInformation(CancellationToken cancellationToken = default)
    {
        using var httpResponse = await client.GetAsync("/admin/debug/memory/stats?includeThreads=false&includeMappings=false", cancellationToken);

        // Fail with a meaningful HTTP error instead of trying to deserialize an error payload
        httpResponse.EnsureSuccessStatusCode();

        var responseBody = await httpResponse.Content.ReadAsStringAsync(cancellationToken);
        var memoryInformation = JsonSerializer.Deserialize<ResponseDto>(responseBody)?.MemoryInformation
            ?? throw new InvalidOperationException("Unexpected response. The memory stats response did not contain any memory information.");

        return (memoryInformation.IsHighDirty, ParseDirtyMemoryKb(memoryInformation.DirtyMemory));
    }

    // Converts a "<amount> <unit>" text into whole kilobytes.
    // NOTE(review): presumably RavenDB reports a humanized size whose unit grows with the amount
    // (e.g. "1.5 MBytes") and uses binary multiples (1 KByte = 1024 Bytes) - confirm against the RavenDB version in use.
    static int ParseDirtyMemoryKb(string dirtyMemory)
    {
        if (!string.IsNullOrWhiteSpace(dirtyMemory))
        {
            var parts = dirtyMemory.Split(' ', StringSplitOptions.RemoveEmptyEntries);

            // The payload is machine generated, so the amount is parsed with the invariant culture
            if (parts.Length >= 2
                && double.TryParse(parts[0], System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands, System.Globalization.CultureInfo.InvariantCulture, out var amount)
                && TryGetKilobytesPerUnit(parts[1], out var kilobytesPerUnit))
            {
                var kilobytes = Math.Round(amount * kilobytesPerUnit);

                // Also rejects NaN and infinity, for which both comparisons are false
                if (kilobytes >= int.MinValue && kilobytes <= int.MaxValue)
                {
                    return (int)kilobytes;
                }
            }
        }

        throw new InvalidOperationException($"Unexpected response. Was expecting memory details as an amount followed by Bytes, KBytes, MBytes, GBytes or TBytes, instead received: {dirtyMemory}");
    }

    // Maps a unit name to the number of kilobytes one such unit represents; returns false for unknown units
    static bool TryGetKilobytesPerUnit(string unit, out double kilobytesPerUnit)
    {
        if (Is("Bytes"))
        {
            kilobytesPerUnit = 1d / 1024;
        }
        else if (Is("KBytes"))
        {
            kilobytesPerUnit = 1;
        }
        else if (Is("MBytes"))
        {
            kilobytesPerUnit = 1024;
        }
        else if (Is("GBytes"))
        {
            kilobytesPerUnit = 1024d * 1024;
        }
        else if (Is("TBytes"))
        {
            kilobytesPerUnit = 1024d * 1024 * 1024;
        }
        else
        {
            kilobytesPerUnit = 0;
            return false;
        }

        return true;

        bool Is(string expected) => string.Equals(unit, expected, StringComparison.OrdinalIgnoreCase);
    }
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ServiceControl Health: Error Database Index Errors
ServiceControl Health: Error Database Index Lag
ServiceControl Health: Message Ingestion Process
ServiceControl Health: RavenDB dirty memory trends
Storage space: ServiceControl database
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ServiceControl Health: Error Database Index Errors
ServiceControl Health: Error Database Index Lag
ServiceControl Health: Message Ingestion Process
ServiceControl Health: RavenDB dirty memory trends
Storage space: ServiceControl database
Loading