Skip to content

Commit a563f84

Browse files
craig[bot]angles-n-daemons
andcommitted
Merge #144414
144414: structlogging: conditionally log hot ranges if the node is burdened r=angles-n-daemons a=angles-n-daemons structlogging: conditionally log hot ranges if the node is burdened Prior to this change, if the system were enabled to, the hot ranges would log on a regular interval from each node in the system. This interval however was long (4h by default) and would leave a lot of gaps in the system, any hotspot which wasn't effectively continuous was likely to be missed from it. So that partner teams have better historical visibility into the system, we here add a check every minute for whether the node is in a "burdened" state. Burdened here is defined as any given range using greater than 250ms cpu time per second. If it is, we log the hot ranges for that node. Doing so allows for both better precision, if a range is hot, it's likely to be logged, due to the minute by minute check, without sacrificing recall, in that these decisions are made at a node local level. For multi-tenant deployments, this check happens instead once every 5 minutes, and polls & logs the entire cluster. Fixes: #138767 Epic: CRDB-43150 Release note (general): If the cluster setting `server.telemetry.hot_ranges_stats.enabled` is enabled, Co-authored-by: Brian Dillmann <[email protected]>
2 parents 3bc72aa + e0d4276 commit a563f84

File tree

5 files changed

+298
-72
lines changed

5 files changed

+298
-72
lines changed

pkg/server/status.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3059,6 +3059,7 @@ func (s *systemStatusServer) localHotRanges(
30593059
slices.SortFunc(resp.Ranges, func(a, b *serverpb.HotRangesResponseV2_HotRange) int {
30603060
return cmp.Compare(a.CPUTimePerSecond, b.CPUTimePerSecond)
30613061
})
3062+
30623063
// truncate the response if localLimit is set
30633064
if localLimit != 0 && localLimit < len(resp.Ranges) {
30643065
resp.Ranges = resp.Ranges[:localLimit]

pkg/server/structlogging/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,11 @@ go_test(
4141
"//pkg/base",
4242
"//pkg/kv/kvserver",
4343
"//pkg/kv/kvserver/allocator/plan",
44+
"//pkg/roachpb",
4445
"//pkg/security/securityassets",
4546
"//pkg/security/securitytest",
4647
"//pkg/server",
48+
"//pkg/settings/cluster",
4749
"//pkg/testutils",
4850
"//pkg/testutils/serverutils",
4951
"//pkg/testutils/skip",
@@ -54,5 +56,6 @@ go_test(
5456
"//pkg/util/log/logpb",
5557
"//pkg/util/randutil",
5658
"//pkg/util/syncutil",
59+
"@com_github_stretchr_testify//require",
5760
],
5861
)

pkg/server/structlogging/hot_ranges_log.go

Lines changed: 105 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,16 @@ import (
2525
)
2626

2727
// ReportTopHottestRanges limits the number of ranges to be reported per iteration
28-
const ReportTopHottestRanges = 5
28+
var ReportTopHottestRanges int32 = 5
29+
30+
// CheckInterval is the interval at which the system checks
31+
// whether or not to log the hot ranges.
32+
var CheckInterval = time.Second
33+
34+
// TestLoopChannel triggers the hot ranges logging loop to start again.
35+
// It's useful in the context of a test, where we don't want to wait
36+
// for whatever the last time the interval was.
37+
var TestLoopChannel = make(chan struct{}, 1)
2938

3039
var TelemetryHotRangesStatsInterval = settings.RegisterDurationSetting(
3140
settings.ApplicationLevel,
@@ -48,18 +57,38 @@ var TelemetryHotRangesStatsLoggingDelay = settings.RegisterDurationSetting(
4857
1*time.Second,
4958
)
5059

60+
// TelemetryHotRangesStatsCPUThreshold defines the cpu duration
61+
// per second which needs to be exceeded for the system to automatically
62+
// log the hot ranges. It tracks the reasoning that the kv layer
63+
// uses to determine when to begin sampling reads for a given
64+
// range in the keyspace, more information found where the cluster
65+
// setting SplitByLoadCPUThreshold is defined.
66+
var TelemetryHotRangesStatsCPUThreshold = settings.RegisterDurationSetting(
67+
settings.SystemOnly,
68+
"server.telemetry.hot_ranges_stats.cpu_threshold",
69+
"the cpu time over which the system will automatically begin logging hot ranges",
70+
time.Second/4,
71+
)
72+
5173
// hotRangesLoggingScheduler is responsible for logging index usage stats
5274
// on a scheduled interval.
5375
type hotRangesLoggingScheduler struct {
54-
ie sql.InternalExecutor
5576
sServer serverpb.TenantStatusServer
5677
st *cluster.Settings
5778
stopper *stop.Stopper
5879
job *jobs.Job
5980
multiTenant bool
81+
lastLogged time.Time
6082
}
6183

62-
// StartHotRangesLoggingScheduler starts the capture index usage statistics logging scheduler.
84+
// StartHotRangesLoggingScheduler starts the hot range log task
85+
// or job.
86+
//
87+
// For system tenants, or single tenant deployments, it runs as
88+
// a task on each node, logging only the ranges on the node in
89+
// which it runs. For app tenants in a multi-tenant deployment,
90+
// it runs on a single node in the sql cluster, applying a fanout
91+
// to the kv layer to collect the hot ranges from all nodes.
6392
func StartHotRangesLoggingScheduler(
6493
ctx context.Context,
6594
stopper *stop.Stopper,
@@ -68,30 +97,32 @@ func StartHotRangesLoggingScheduler(
6897
st *cluster.Settings,
6998
ti *tenantcapabilities.Entry,
7099
) error {
71-
multiTenant := ti != nil && !ti.TenantID.IsSystem()
100+
multiTenant := ti != nil && ti.TenantID.IsSet() && !ti.TenantID.IsSystem()
72101
scheduler := hotRangesLoggingScheduler{
73-
ie: ie,
74102
sServer: sServer,
75103
st: st,
76104
stopper: stopper,
77105
multiTenant: multiTenant,
106+
lastLogged: timeutil.Now(),
78107
}
79108

80109
if multiTenant {
81-
return scheduler.startJob(ctx, stopper)
110+
return scheduler.startJob()
82111
}
83112

84113
return scheduler.startTask(ctx, stopper)
85114
}
86115

116+
// startTask is for usage in a system-tenant or non-multi-tenant
117+
// installation.
87118
func (s *hotRangesLoggingScheduler) startTask(ctx context.Context, stopper *stop.Stopper) error {
88119
return stopper.RunAsyncTask(ctx, "hot-ranges-stats", func(ctx context.Context) {
89120
err := s.start(ctx, stopper)
90121
log.Warningf(ctx, "hot ranges stats logging scheduler stopped: %s", err)
91122
})
92123
}
93124

94-
func (s *hotRangesLoggingScheduler) startJob(ctx context.Context, stopper *stop.Stopper) error {
125+
func (s *hotRangesLoggingScheduler) startJob() error {
95126
jobs.RegisterConstructor(
96127
jobspb.TypeHotRangesLogger,
97128
func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
@@ -103,57 +134,99 @@ func (s *hotRangesLoggingScheduler) startJob(ctx context.Context, stopper *stop.
103134
}
104135

105136
func (s *hotRangesLoggingScheduler) start(ctx context.Context, stopper *stop.Stopper) error {
106-
intervalChangedChan := make(chan struct{})
107-
// We have to register this callback first. Otherwise we may run into
108-
// an unlikely but possible scenario where we've started the ticker,
109-
// and the setting is changed before we register the callback and the
110-
// ticker will not be reset to the new value.
111-
TelemetryHotRangesStatsInterval.SetOnChange(&s.st.SV, func(ctx context.Context) {
112-
intervalChangedChan <- struct{}{}
113-
})
114-
115-
ticker := time.NewTicker(TelemetryHotRangesStatsInterval.Get(&s.st.SV))
116-
117137
for {
138+
ci := CheckInterval
139+
if s.multiTenant {
140+
ci *= 5
141+
}
118142
select {
119143
case <-stopper.ShouldQuiesce():
120144
return nil
121145
case <-ctx.Done():
122146
return nil
123-
case <-ticker.C:
147+
case <-time.After(ci):
124148
s.maybeLogHotRanges(ctx, stopper)
125-
case <-intervalChangedChan:
126-
ticker.Reset(TelemetryHotRangesStatsInterval.Get(&s.st.SV))
149+
case <-TestLoopChannel:
150+
continue
127151
}
128152
}
129153
}
130154

131155
// maybeLogHotRanges is a small helper function which couples the
132156
// functionality of checking whether to log and logging.
133157
func (s *hotRangesLoggingScheduler) maybeLogHotRanges(ctx context.Context, stopper *stop.Stopper) {
134-
if s.shouldLog() {
158+
if s.shouldLog(ctx) {
135159
s.logHotRanges(ctx, stopper)
160+
s.lastLogged = timeutil.Now()
161+
}
162+
}
163+
164+
// shouldLog checks the below conditions to see whether it
165+
// should emit logs.
166+
//
167+
// To return true, we verify that both:
168+
// - The logging setting is enabled.
169+
// - One of the following conditions is met:
170+
// -- It's been greater than the log interval since we last logged.
171+
// -- One of the replicas see exceeds our cpu threshold.
172+
func (s *hotRangesLoggingScheduler) shouldLog(ctx context.Context) bool {
173+
enabled := TelemetryHotRangesStatsEnabled.Get(&s.st.SV)
174+
if !enabled {
175+
return false
176+
}
177+
178+
logInterval := TelemetryHotRangesStatsInterval.Get(&s.st.SV)
179+
if timeutil.Since(s.lastLogged) > logInterval {
180+
return true
136181
}
182+
183+
// Getting the hot ranges with the statsOnly flag will
184+
// ensure the call doesn't touch the keyspace. Therefore
185+
// drastically lightening the overhead of fetching them.
186+
resp, err := s.getHotRanges(context.Background(), true)
187+
if err != nil {
188+
log.Warningf(ctx, "failed to get hot ranges: %s", err)
189+
return false
190+
}
191+
cpuThreshold := TelemetryHotRangesStatsCPUThreshold.Get(&s.st.SV)
192+
return maxCPU(resp.Ranges) > cpuThreshold
137193
}
138194

139-
// shouldLog checks the below conditions to see whether it should emit logs.
140-
// - Is the cluster setting server.telemetry.hot_ranges_stats.enabled true?
141-
func (s *hotRangesLoggingScheduler) shouldLog() bool {
142-
return TelemetryHotRangesStatsEnabled.Get(&s.st.SV)
195+
func maxCPU(ranges []*serverpb.HotRangesResponseV2_HotRange) time.Duration {
196+
maxSeen := float64(0)
197+
for _, r := range ranges {
198+
if r.CPUTimePerSecond > maxSeen {
199+
maxSeen = r.CPUTimePerSecond
200+
}
201+
}
202+
return time.Duration(maxSeen)
143203
}
144204

145-
// logHotRanges collects the hot ranges from this node's status server and
146-
// sends them to the TELEMETRY log channel.
147-
func (s *hotRangesLoggingScheduler) logHotRanges(ctx context.Context, stopper *stop.Stopper) {
148-
req := &serverpb.HotRangesRequest{}
205+
// getHotRanges is a simple utility function for making a hot ranges
206+
// request to the status server. It can be used to fetch only the
207+
// stats for ranges requested, or everything. It also determines
208+
// whether to limit the request to only the local node, or to
209+
// issue a fanout for multi-tenant apps.
210+
func (s *hotRangesLoggingScheduler) getHotRanges(
211+
ctx context.Context, statsOnly bool,
212+
) (*serverpb.HotRangesResponseV2, error) {
213+
req := &serverpb.HotRangesRequest{
214+
PerNodeLimit: ReportTopHottestRanges,
215+
StatsOnly: statsOnly,
216+
}
149217

150218
// if we are running in single tenant mode, only log the ranges on the status server.
151219
if !s.multiTenant {
152220
req.Nodes = []string{"local"}
153-
req.PageSize = ReportTopHottestRanges
154221
}
155222

156-
resp, err := s.sServer.HotRangesV2(ctx, req)
223+
return s.sServer.HotRangesV2(ctx, req)
224+
}
225+
226+
// logHotRanges collects the hot ranges from this node's status server and
227+
// sends them to the HEALTH log channel.
228+
func (s *hotRangesLoggingScheduler) logHotRanges(ctx context.Context, stopper *stop.Stopper) {
229+
resp, err := s.getHotRanges(ctx, false)
157230
if err != nil {
158231
log.Warningf(ctx, "failed to get hot ranges: %s", err)
159232
return

pkg/server/structlogging/hot_ranges_log_job.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@ import (
1313
"github.com/cockroachdb/errors"
1414
)
1515

16+
// hot_ranges_log_job.go adds the required functions to satisfy
17+
// the jobs.Scheduler interface for the hot ranges logging job.
18+
// This is only required for app tenants in a multi-tenant deployment
19+
// as the app tenants have no notion of "local" ranges, and therefore
20+
// require a fanout to be performed to collect the hot ranges.
21+
// It's run as a job, as since fanout is required, only one node
22+
// needs to run it at any given time, as opposed to the every
23+
// node task behavior otherwise.
1624
func (s *hotRangesLoggingScheduler) Resume(ctx context.Context, execCtxI interface{}) error {
1725
// This job is a forever running background job, and it is always safe to
1826
// terminate the SQL pod whenever the job is running, so mark it as idle.

0 commit comments

Comments
 (0)