@@ -25,7 +25,16 @@ import (
2525)
2626
// ReportTopHottestRanges limits the number of ranges to be reported per iteration.
// It is an int32 var (previously a const) so it can be assigned directly to
// HotRangesRequest.PerNodeLimit; presumably left mutable so tests can
// override it — TODO confirm.
var ReportTopHottestRanges int32 = 5

// CheckInterval is the interval at which the system checks
// whether or not to log the hot ranges.
// NOTE(review): a package-level var, apparently so tests can shorten
// the wait — confirm against callers.
var CheckInterval = time.Second

// TestLoopChannel triggers the hot ranges logging loop to start again.
// It's useful in the context of a test, where we don't want to wait
// for whatever the last time the interval was.
// The buffer of 1 lets a test send a nudge without blocking.
var TestLoopChannel = make(chan struct{}, 1)
2938
3039var TelemetryHotRangesStatsInterval = settings .RegisterDurationSetting (
3140 settings .ApplicationLevel ,
@@ -48,18 +57,38 @@ var TelemetryHotRangesStatsLoggingDelay = settings.RegisterDurationSetting(
4857 1 * time .Second ,
4958)
5059
// TelemetryHotRangesStatsCPUThreshold defines the cpu duration
// per second which needs to be exceeded for the system to automatically
// log the hot ranges. It tracks the reasoning that the kv layer
// uses to determine when to begin sampling reads for a given
// range in the keyspace; more information can be found where the cluster
// setting SplitByLoadCPUThreshold is defined.
var TelemetryHotRangesStatsCPUThreshold = settings.RegisterDurationSetting(
	settings.SystemOnly,
	"server.telemetry.hot_ranges_stats.cpu_threshold",
	"the cpu time over which the system will automatically begin logging hot ranges",
	// Default: 250ms of CPU time per second of wall time.
	time.Second/4,
)
72+
// hotRangesLoggingScheduler is responsible for logging hot ranges
// on a scheduled interval.
// NOTE(review): the previous comment said "index usage stats", which
// looks like a copy/paste leftover from another scheduler.
type hotRangesLoggingScheduler struct {
	sServer serverpb.TenantStatusServer // status server used to fetch hot ranges via HotRangesV2
	st      *cluster.Settings           // cluster settings: enabled flag, interval, cpu threshold
	stopper *stop.Stopper
	job     *jobs.Job
	// multiTenant is true for app tenants in a multi-tenant deployment;
	// it widens the hot-ranges request to a fanout (Nodes left unset)
	// and slows the check loop (5x CheckInterval).
	multiTenant bool
	// lastLogged is the last time hot ranges were emitted; shouldLog
	// compares it against the logging interval.
	lastLogged time.Time
}
6183
62- // StartHotRangesLoggingScheduler starts the capture index usage statistics logging scheduler.
84+ // StartHotRangesLoggingScheduler starts the hot range log task
85+ // or job.
86+ //
87+ // For system tenants, or single tenant deployments, it runs as
88+ // a task on each node, logging only the ranges on the node in
89+ // which it runs. For app tenants in a multi-tenant deployment,
90+ // it runs on a single node in the sql cluster, applying a fanout
91+ // to the kv layer to collect the hot ranges from all nodes.
6392func StartHotRangesLoggingScheduler (
6493 ctx context.Context ,
6594 stopper * stop.Stopper ,
@@ -68,30 +97,32 @@ func StartHotRangesLoggingScheduler(
6897 st * cluster.Settings ,
6998 ti * tenantcapabilities.Entry ,
7099) error {
71- multiTenant := ti != nil && ! ti .TenantID .IsSystem ()
100+ multiTenant := ti != nil && ti . TenantID . IsSet () && ! ti .TenantID .IsSystem ()
72101 scheduler := hotRangesLoggingScheduler {
73- ie : ie ,
74102 sServer : sServer ,
75103 st : st ,
76104 stopper : stopper ,
77105 multiTenant : multiTenant ,
106+ lastLogged : timeutil .Now (),
78107 }
79108
80109 if multiTenant {
81- return scheduler .startJob (ctx , stopper )
110+ return scheduler .startJob ()
82111 }
83112
84113 return scheduler .startTask (ctx , stopper )
85114}
86115
116+ // startTask is for usage in a system-tenant or non-multi-tenant
117+ // installation.
87118func (s * hotRangesLoggingScheduler ) startTask (ctx context.Context , stopper * stop.Stopper ) error {
88119 return stopper .RunAsyncTask (ctx , "hot-ranges-stats" , func (ctx context.Context ) {
89120 err := s .start (ctx , stopper )
90121 log .Warningf (ctx , "hot ranges stats logging scheduler stopped: %s" , err )
91122 })
92123}
93124
94- func (s * hotRangesLoggingScheduler ) startJob (ctx context. Context , stopper * stop. Stopper ) error {
125+ func (s * hotRangesLoggingScheduler ) startJob () error {
95126 jobs .RegisterConstructor (
96127 jobspb .TypeHotRangesLogger ,
97128 func (job * jobs.Job , settings * cluster.Settings ) jobs.Resumer {
@@ -103,57 +134,99 @@ func (s *hotRangesLoggingScheduler) startJob(ctx context.Context, stopper *stop.
103134}
104135
105136func (s * hotRangesLoggingScheduler ) start (ctx context.Context , stopper * stop.Stopper ) error {
106- intervalChangedChan := make (chan struct {})
107- // We have to register this callback first. Otherwise we may run into
108- // an unlikely but possible scenario where we've started the ticker,
109- // and the setting is changed before we register the callback and the
110- // ticker will not be reset to the new value.
111- TelemetryHotRangesStatsInterval .SetOnChange (& s .st .SV , func (ctx context.Context ) {
112- intervalChangedChan <- struct {}{}
113- })
114-
115- ticker := time .NewTicker (TelemetryHotRangesStatsInterval .Get (& s .st .SV ))
116-
117137 for {
138+ ci := CheckInterval
139+ if s .multiTenant {
140+ ci *= 5
141+ }
118142 select {
119143 case <- stopper .ShouldQuiesce ():
120144 return nil
121145 case <- ctx .Done ():
122146 return nil
123- case <- ticker . C :
147+ case <- time . After ( ci ) :
124148 s .maybeLogHotRanges (ctx , stopper )
125- case <- intervalChangedChan :
126- ticker . Reset ( TelemetryHotRangesStatsInterval . Get ( & s . st . SV ))
149+ case <- TestLoopChannel :
150+ continue
127151 }
128152 }
129153}
130154
131155// maybeLogHotRanges is a small helper function which couples the
132156// functionality of checking whether to log and logging.
133157func (s * hotRangesLoggingScheduler ) maybeLogHotRanges (ctx context.Context , stopper * stop.Stopper ) {
134- if s .shouldLog () {
158+ if s .shouldLog (ctx ) {
135159 s .logHotRanges (ctx , stopper )
160+ s .lastLogged = timeutil .Now ()
161+ }
162+ }
163+
164+ // shouldLog checks the below conditions to see whether it
165+ // should emit logs.
166+ //
167+ // To return true, we verify that both:
168+ // - The logging setting is enabled.
169+ // - One of the following conditions is met:
170+ // -- It's been greater than the log interval since we last logged.
171+ // -- One of the replicas see exceeds our cpu threshold.
172+ func (s * hotRangesLoggingScheduler ) shouldLog (ctx context.Context ) bool {
173+ enabled := TelemetryHotRangesStatsEnabled .Get (& s .st .SV )
174+ if ! enabled {
175+ return false
176+ }
177+
178+ logInterval := TelemetryHotRangesStatsInterval .Get (& s .st .SV )
179+ if timeutil .Since (s .lastLogged ) > logInterval {
180+ return true
136181 }
182+
183+ // Getting the hot ranges with the statsOnly flag will
184+ // ensure the call doesn't touch the keyspace. Therefore
185+ // drastically lightening the overhead of fetching them.
186+ resp , err := s .getHotRanges (context .Background (), true )
187+ if err != nil {
188+ log .Warningf (ctx , "failed to get hot ranges: %s" , err )
189+ return false
190+ }
191+ cpuThreshold := TelemetryHotRangesStatsCPUThreshold .Get (& s .st .SV )
192+ return maxCPU (resp .Ranges ) > cpuThreshold
137193}
138194
139- // shouldLog checks the below conditions to see whether it should emit logs.
140- // - Is the cluster setting server.telemetry.hot_ranges_stats.enabled true?
141- func (s * hotRangesLoggingScheduler ) shouldLog () bool {
142- return TelemetryHotRangesStatsEnabled .Get (& s .st .SV )
195+ func maxCPU (ranges []* serverpb.HotRangesResponseV2_HotRange ) time.Duration {
196+ maxSeen := float64 (0 )
197+ for _ , r := range ranges {
198+ if r .CPUTimePerSecond > maxSeen {
199+ maxSeen = r .CPUTimePerSecond
200+ }
201+ }
202+ return time .Duration (maxSeen )
143203}
144204
145- // logHotRanges collects the hot ranges from this node's status server and
146- // sends them to the TELEMETRY log channel.
147- func (s * hotRangesLoggingScheduler ) logHotRanges (ctx context.Context , stopper * stop.Stopper ) {
148- req := & serverpb.HotRangesRequest {}
205+ // getHotRanges is a simple utility function for making a hot ranges
206+ // request to the status server. It can be used to fetch only the
207+ // stats for ranges requested, or everything. It also determines
208+ // whether to limit the request to only the local node, or to
209+ // issue a fanout for multi-tenant apps.
210+ func (s * hotRangesLoggingScheduler ) getHotRanges (
211+ ctx context.Context , statsOnly bool ,
212+ ) (* serverpb.HotRangesResponseV2 , error ) {
213+ req := & serverpb.HotRangesRequest {
214+ PerNodeLimit : ReportTopHottestRanges ,
215+ StatsOnly : statsOnly ,
216+ }
149217
150218 // if we are running in single tenant mode, only log the ranges on the status server.
151219 if ! s .multiTenant {
152220 req .Nodes = []string {"local" }
153- req .PageSize = ReportTopHottestRanges
154221 }
155222
156- resp , err := s .sServer .HotRangesV2 (ctx , req )
223+ return s .sServer .HotRangesV2 (ctx , req )
224+ }
225+
226+ // logHotRanges collects the hot ranges from this node's status server and
227+ // sends them to the HEALTH log channel.
228+ func (s * hotRangesLoggingScheduler ) logHotRanges (ctx context.Context , stopper * stop.Stopper ) {
229+ resp , err := s .getHotRanges (ctx , false )
157230 if err != nil {
158231 log .Warningf (ctx , "failed to get hot ranges: %s" , err )
159232 return
0 commit comments