
Commit 737e32c

structlogging: properly register hot ranges logging job
A test failure in the hot ranges logging job pointed to a race condition in how the job's resumer is registered. After some discussion, this PR brings job registration in line with how the jobs API is meant to be used: the resumer constructor is now registered in an init() function, rather than lazily from the scheduler's startup path.

Fixes: #149041
Epic: None
Release note: none
1 parent ba6e42f commit 737e32c
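Why lazy registration races: the jobs registry may try to adopt and resume the job as soon as the node starts, and if the resumer constructor is only registered from a startup path, adoption can win the race and find no constructor. The following is a minimal, self-contained Go sketch of that failure mode; all names here are a toy model, not the actual jobs package API.

package main

import (
	"fmt"
	"sync"
	"time"
)

// Resumer stands in for jobs.Resumer in this toy model.
type Resumer interface{ Resume() }

type noopResumer struct{}

func (noopResumer) Resume() {}

var (
	mu           sync.Mutex
	constructors = map[string]func() Resumer{}
)

// registerConstructor mimics registering a resumer constructor.
func registerConstructor(typ string, fn func() Resumer) {
	mu.Lock()
	defer mu.Unlock()
	constructors[typ] = fn
}

// resumeJob mimics the registry adopting a job and looking up its resumer.
func resumeJob(typ string) error {
	mu.Lock()
	fn, ok := constructors[typ]
	mu.Unlock()
	if !ok {
		return fmt.Errorf("no resumer constructor registered for job type %q", typ)
	}
	fn().Resume()
	return nil
}

func main() {
	// The registry begins adopting jobs as soon as the node is up.
	adopted := make(chan error, 1)
	go func() { adopted <- resumeJob("hot-ranges-logger") }()

	// Lazy registration from a startup path can lose the race above.
	time.Sleep(time.Millisecond)
	registerConstructor("hot-ranges-logger", func() Resumer { return noopResumer{} })

	// Likely prints the "no resumer constructor" error. Moving
	// registerConstructor into init() removes the window entirely,
	// because init runs before main starts any goroutines.
	fmt.Println(<-adopted)
}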

File tree

3 files changed: +54 −39 lines

pkg/server/structlogging/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -15,6 +15,7 @@ go_library(
         "//pkg/server/serverpb",
         "//pkg/settings",
         "//pkg/settings/cluster",
+        "//pkg/sql",
         "//pkg/util/log",
         "//pkg/util/log/eventpb",
         "//pkg/util/log/logpb",

pkg/server/structlogging/hot_ranges_log.go

Lines changed: 18 additions & 34 deletions

@@ -9,8 +9,6 @@ import (
 	"context"
 	"time"
 
-	"github.com/cockroachdb/cockroach/pkg/jobs"
-	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
 	"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities"
 	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
 	"github.com/cockroachdb/cockroach/pkg/settings"
@@ -73,13 +71,11 @@ type HotRangeGetter interface {
 	HotRangesV2(ctx context.Context, req *serverpb.HotRangesRequest) (*serverpb.HotRangesResponseV2, error)
 }
 
-// hotRangesLoggingScheduler is responsible for logging index usage stats
+// hotRangesLogger is responsible for logging index usage stats
 // on a scheduled interval.
-type hotRangesLoggingScheduler struct {
+type hotRangesLogger struct {
 	sServer     HotRangeGetter
 	st          *cluster.Settings
-	stopper     *stop.Stopper
-	job         *jobs.Job
 	multiTenant bool
 	lastLogged  time.Time
 }
@@ -90,8 +86,8 @@ type hotRangesLoggingScheduler struct {
 // For system tenants, or single tenant deployments, it runs as
 // a task on each node, logging only the ranges on the node in
 // which it runs. For app tenants in a multi-tenant deployment,
-// it runs on a single node in the sql cluster, applying a fanout
-// to the kv layer to collect the hot ranges from all nodes.
+// it does nothing, allowing the hot range logging job to be the
+// entrypoint.
 func StartHotRangesLoggingScheduler(
 	ctx context.Context,
 	stopper *stop.Stopper,
@@ -100,42 +96,30 @@ func StartHotRangesLoggingScheduler(
 	ti *tenantcapabilities.Entry,
 ) error {
 	multiTenant := ti != nil && ti.TenantID.IsSet() && !ti.TenantID.IsSystem()
-	scheduler := hotRangesLoggingScheduler{
+
+	if multiTenant {
+		return nil
+	}
+
+	logger := hotRangesLogger{
 		sServer:     sServer,
 		st:          st,
-		stopper:     stopper,
-		multiTenant: multiTenant,
+		multiTenant: false,
 		lastLogged:  timeutil.Now(),
 	}
 
-	if multiTenant {
-		return scheduler.startJob()
-	}
-
-	return scheduler.startTask(ctx, stopper)
+	return logger.startTask(ctx, stopper)
 }
 
 // startTask is for usage in a system-tenant or non-multi-tenant
 // installation.
-func (s *hotRangesLoggingScheduler) startTask(ctx context.Context, stopper *stop.Stopper) error {
+func (s *hotRangesLogger) startTask(ctx context.Context, stopper *stop.Stopper) error {
 	return stopper.RunAsyncTask(ctx, "hot-ranges-stats", func(ctx context.Context) {
 		s.start(ctx, stopper)
 	})
 }
 
-func (s *hotRangesLoggingScheduler) startJob() error {
-	jobs.RegisterConstructor(
-		jobspb.TypeHotRangesLogger,
-		func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
-			s.job = job
-			return s
-		},
-		jobs.DisablesTenantCostControl,
-	)
-	return nil
-}
-
-func (s *hotRangesLoggingScheduler) start(ctx context.Context, stopper *stop.Stopper) {
+func (s *hotRangesLogger) start(ctx context.Context, stopper *stop.Stopper) {
 	for {
 		ci := CheckInterval
 		if s.multiTenant {
@@ -156,7 +140,7 @@ func (s *hotRangesLoggingScheduler) start(ctx context.Context, stopper *stop.Sto
 
 // maybeLogHotRanges is a small helper function which couples the
 // functionality of checking whether to log and logging.
-func (s *hotRangesLoggingScheduler) maybeLogHotRanges(ctx context.Context, stopper *stop.Stopper) {
+func (s *hotRangesLogger) maybeLogHotRanges(ctx context.Context, stopper *stop.Stopper) {
 	if s.shouldLog(ctx) {
 		s.logHotRanges(ctx, stopper)
 		s.lastLogged = timeutil.Now()
@@ -171,7 +155,7 @@ func (s *hotRangesLoggingScheduler) maybeLogHotRanges(ctx context.Context, stopp
 // - One of the following conditions is met:
 // -- It's been greater than the log interval since we last logged.
 // -- One of the replicas see exceeds our cpu threshold.
-func (s *hotRangesLoggingScheduler) shouldLog(ctx context.Context) bool {
+func (s *hotRangesLogger) shouldLog(ctx context.Context) bool {
 
 	enabled := TelemetryHotRangesStatsEnabled.Get(&s.st.SV)
 	if !enabled {
@@ -210,7 +194,7 @@ func maxCPU(ranges []*serverpb.HotRangesResponseV2_HotRange) time.Duration {
 // stats for ranges requested, or everything. It also determines
 // whether to limit the request to only the local node, or to
 // issue a fanout for multi-tenant apps.
-func (s *hotRangesLoggingScheduler) getHotRanges(
+func (s *hotRangesLogger) getHotRanges(
 	ctx context.Context, statsOnly bool,
 ) (*serverpb.HotRangesResponseV2, error) {
 	req := &serverpb.HotRangesRequest{
@@ -228,7 +212,7 @@ func (s *hotRangesLoggingScheduler) getHotRanges(
 
 // logHotRanges collects the hot ranges from this node's status server and
 // sends them to the HEALTH log channel.
-func (s *hotRangesLoggingScheduler) logHotRanges(ctx context.Context, stopper *stop.Stopper) {
+func (s *hotRangesLogger) logHotRanges(ctx context.Context, stopper *stop.Stopper) {
 	resp, err := s.getHotRanges(ctx, false)
 	if err != nil {
 		log.Warningf(ctx, "failed to get hot ranges: %s", err)
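The shouldLog doc comment in the hunk above describes two triggers: the log interval has elapsed since lastLogged, or some replica's CPU exceeds the configured threshold. A standalone Go sketch of that gating logic follows; the parameter names are assumed for illustration, since the real method reads these values from cluster settings and the hot-ranges response.

package main

import (
	"fmt"
	"time"
)

// shouldLogSketch mirrors the two conditions in the shouldLog comment:
// log if the interval has elapsed since lastLogged, or if the hottest
// replica's CPU time exceeds the configured threshold.
func shouldLogSketch(
	enabled bool,
	lastLogged time.Time,
	logInterval time.Duration,
	maxReplicaCPU, cpuThreshold time.Duration,
) bool {
	if !enabled {
		return false
	}
	if time.Since(lastLogged) > logInterval {
		return true
	}
	return maxReplicaCPU > cpuThreshold
}

func main() {
	// true: the log interval has elapsed.
	fmt.Println(shouldLogSketch(true, time.Now().Add(-5*time.Minute), time.Minute, 0, 250*time.Millisecond))
	// true: a replica is over the CPU threshold.
	fmt.Println(shouldLogSketch(true, time.Now(), time.Hour, 300*time.Millisecond, 250*time.Millisecond))
}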

pkg/server/structlogging/hot_ranges_log_job.go

Lines changed: 35 additions & 5 deletions

@@ -9,10 +9,19 @@ import (
 	"context"
 
 	"github.com/cockroachdb/cockroach/pkg/jobs"
+	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
+	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
+	"github.com/cockroachdb/cockroach/pkg/sql"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/errors"
 )
 
+type hotRangesLoggingJob struct {
+	job      *jobs.Job
+	settings *cluster.Settings
+}
+
 // hot_ranges_log_job.go adds the required functions to satisfy
 // the jobs.Scheduler interface for the hot ranges logging job.
 // This is only required for app tenants in a multi-tenant deployment
@@ -21,16 +30,24 @@ import (
 // It's run as a job, as since fanout is required, only one node
 // needs to run it at any given time, as opposed to the every
 // node task behavior otherwise.
-func (s *hotRangesLoggingScheduler) Resume(ctx context.Context, execCtxI interface{}) error {
+func (j *hotRangesLoggingJob) Resume(ctx context.Context, execCtxI interface{}) error {
 	// This job is a forever running background job, and it is always safe to
 	// terminate the SQL pod whenever the job is running, so mark it as idle.
-	s.job.MarkIdle(true)
+	j.job.MarkIdle(true)
 
-	s.start(ctx, s.stopper)
+	jobExec := execCtxI.(sql.JobExecContext)
+	execCfg := jobExec.ExecCfg()
+	logger := &hotRangesLogger{
+		sServer:     execCfg.TenantStatusServer,
+		st:          j.settings,
+		multiTenant: true,
+		lastLogged:  timeutil.Now(),
+	}
+	logger.start(ctx, execCfg.Stopper)
 	return nil
 }
 
-func (s *hotRangesLoggingScheduler) OnFailOrCancel(
+func (j *hotRangesLoggingJob) OnFailOrCancel(
 	ctx context.Context, execCtx interface{}, jobErr error,
 ) error {
 	if jobs.HasErrJobCanceled(jobErr) {
@@ -42,6 +59,19 @@ func (s *hotRangesLoggingScheduler) OnFailOrCancel(
 	return nil
 }
 
-func (s *hotRangesLoggingScheduler) CollectProfile(ctx context.Context, execCtx interface{}) error {
+func (j *hotRangesLoggingJob) CollectProfile(ctx context.Context, execCtx interface{}) error {
 	return nil
 }
+
+func init() {
+	jobs.RegisterConstructor(
+		jobspb.TypeHotRangesLogger,
+		func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
+			return &hotRangesLoggingJob{
+				job:      job,
+				settings: settings,
+			}
+		},
+		jobs.DisablesTenantCostControl,
+	)
+}
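With registration moved to init(), the constructor for jobspb.TypeHotRangesLogger is in place during package initialization, before any goroutine can adopt the job, which closes the race described in the commit message. The resumer also no longer carries scheduler state: it rebuilds its hotRangesLogger from the job's execution context on each Resume, matching the registration pattern other built-in jobs use.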
