1+ namespace ServiceControl . Persistence . RavenDB . CustomChecks ;
2+
3+ using System ;
4+ using System . Collections . Generic ;
5+ using System . Threading ;
6+ using System . Threading . Tasks ;
7+ using NServiceBus . CustomChecks ;
8+ using NServiceBus . Logging ;
9+
/// <summary>
/// Custom check that fails when the memory information retriever reports a high level of
/// RavenDB dirty memory, or when the most recent dirty memory readings show an increasing trend.
/// The check is registered to run every 5 minutes.
/// </summary>
class CheckDirtyMemory(MemoryInformationRetriever memoryInformationRetriever) : CustomCheck("RavenDB dirty memory trends", "ServiceControl Health", TimeSpan.FromMinutes(5))
{
    // Upper bound on the number of readings kept for trend analysis. With one reading every
    // 5 minutes this keeps about 1 hour and 40 minutes of data.
    const int MaxRetainedReads = 20;

    // Minimum number of readings required before a trend is calculated. Three means we'll be
    // observing for 15 minutes before calculating the trend.
    const int MinReadsForTrend = 3;

    // Slopes whose absolute value is below this are treated as flat.
    // Small threshold to handle floating-point precision.
    const double SlopeThreshold = 0.001;

    // Shared tail of both failure messages.
    const string MitigationGuidance = "Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue.";

    static readonly ILog Log = LogManager.GetLogger<CheckDirtyMemory>();

    // Most recent dirty memory readings, oldest first, capped at MaxRetainedReads entries.
    readonly List<int> lastDirtyMemoryReads = [];

    /// <summary>
    /// Reads the current dirty memory information and evaluates it.
    /// </summary>
    /// <param name="cancellationToken">Token forwarded to the memory information retriever.</param>
    /// <returns>
    /// A failed result when the retriever flags the dirty memory as high, or when the retained
    /// readings show an increasing trend; otherwise a passing result.
    /// </returns>
    public override async Task<CheckResult> PerformCheck(CancellationToken cancellationToken = default)
    {
        var (isHighDirty, dirtyMemoryKb) = await memoryInformationRetriever.GetMemoryInformation(cancellationToken);

        if (isHighDirty)
        {
            // NOTE(review): readings taken while dirty memory is high are not added to the series,
            // so the trend calculation below can see gaps between consecutive samples even though
            // the regression treats them as evenly spaced. Confirm this is intended.
            var highMessage = $"There is a high level of RavenDB dirty memory ({dirtyMemoryKb} kb). {MitigationGuidance}";
            Log.Warn(highMessage);
            return CheckResult.Failed(highMessage);
        }

        lastDirtyMemoryReads.Add(dirtyMemoryKb);
        if (lastDirtyMemoryReads.Count > MaxRetainedReads)
        {
            // Drop the oldest reading so the series never exceeds the cap.
            lastDirtyMemoryReads.RemoveAt(0);
        }

        if (lastDirtyMemoryReads.Count < MinReadsForTrend)
        {
            Log.Debug("Not enough RavenDB dirty memory data in the series to calculate a trend.");
            return CheckResult.Pass;
        }

        // TODO do we need a threshold below which the check never fails?
        if (AnalyzeTrendUsingRegression(lastDirtyMemoryReads) == TrendDirection.Increasing)
        {
            var trendMessage = $"RavenDB dirty memory is increasing. Last available value is {dirtyMemoryKb} kb. {MitigationGuidance}";
            Log.Warn(trendMessage);
            return CheckResult.Failed(trendMessage);
        }

        return CheckResult.Pass;
    }

    /// <summary>
    /// Classifies the direction of a series by fitting a least-squares line through the points
    /// (index, value) and inspecting the sign and magnitude of its slope.
    /// </summary>
    /// <param name="values">The series to analyze, in chronological order; at least two entries.</param>
    /// <returns>
    /// <see cref="TrendDirection.Flat"/> when the absolute slope is below the threshold, otherwise
    /// <see cref="TrendDirection.Increasing"/> or <see cref="TrendDirection.Decreasing"/> by sign.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="values"/> is null or contains fewer than two entries.
    /// </exception>
    static TrendDirection AnalyzeTrendUsingRegression(List<int> values)
    {
        if (values is not { Count: > 1 })
        {
            throw new ArgumentException("Need at least two values to determine a trend", nameof(values));
        }

        // Accumulate the sums needed by the least-squares slope formula; x is the sample index.
        double numberOfPoints = values.Count;
        double sumOfIndices = 0;
        double sumOfValues = 0;
        double sumOfIndicesMultipliedByValues = 0;
        double sumOfIndicesSquared = 0;

        for (int i = 0; i < values.Count; i++)
        {
            double index = i;
            double value = values[i];

            sumOfIndices += index;
            sumOfValues += value;
            sumOfIndicesMultipliedByValues += index * value;
            sumOfIndicesSquared += index * index;
        }

        // Slope formula: (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
        // The denominator is non-zero because the indices 0..n-1 are distinct and n >= 2.
        double slopeNumerator = (numberOfPoints * sumOfIndicesMultipliedByValues) - (sumOfIndices * sumOfValues);
        double slopeDenominator = (numberOfPoints * sumOfIndicesSquared) - (sumOfIndices * sumOfIndices);
        double slope = slopeNumerator / slopeDenominator;

        if (Math.Abs(slope) < SlopeThreshold)
        {
            return TrendDirection.Flat;
        }

        return slope > 0 ? TrendDirection.Increasing : TrendDirection.Decreasing;
    }

    // Direction of the fitted regression line over the retained readings.
    enum TrendDirection
    {
        Increasing,
        Decreasing,
        Flat
    }
}
0 commit comments