Skip to content

Commit e074bcf

Browse files
craig[bot]angles-n-daemons
andcommitted
Merge #149428
149428: structlogging: fix hot ranges logging job exit mechanism r=angles-n-daemons a=angles-n-daemons We configured the hot ranges logging job (to be released in 25.3) as a forever background job, which means its meant to run forever, and only for app tenants. However there were two issues with our current implementation. The first is that it runs both for app tenants and system tenants, and second is that for the app tenants, on quiescence it exists with a nil error. In order to run forever, quiescence requires the job to error, so that it is picked up by another node when a node is exiting. Fixes: none Epic: none Release note: none Co-authored-by: Brian Dillmann <[email protected]>
2 parents 9e36b27 + 80d454f commit e074bcf

File tree

3 files changed

+69
-1
lines changed

3 files changed

+69
-1
lines changed

pkg/server/structlogging/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ go_library(
2929
go_test(
3030
name = "structlogging_test",
3131
srcs = [
32+
"hot_ranges_log_job_test.go",
3233
"hot_ranges_log_test.go",
3334
"main_test.go",
3435
],
@@ -38,6 +39,8 @@ go_test(
3839
}),
3940
deps = [
4041
":structlogging",
42+
"//pkg/base",
43+
"//pkg/jobs",
4144
"//pkg/security/securityassets",
4245
"//pkg/security/securitytest",
4346
"//pkg/server",

pkg/server/structlogging/hot_ranges_log_job.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,22 @@ func (j *hotRangesLoggingJob) Resume(ctx context.Context, execCtxI interface{})
3737

3838
jobExec := execCtxI.(sql.JobExecContext)
3939
execCfg := jobExec.ExecCfg()
40+
41+
// Do not run this job for the system tenant.
42+
if execCfg.Codec.ForSystemTenant() {
43+
return nil
44+
}
45+
4046
logger := &hotRangesLogger{
4147
sServer: execCfg.TenantStatusServer,
4248
st: j.settings,
4349
multiTenant: true,
4450
lastLogged: timeutil.Now(),
4551
}
4652
logger.start(ctx, execCfg.Stopper)
47-
return nil
53+
54+
// Signal to the job system to pick this job back up.
55+
return jobs.MarkAsRetryJobError(errors.New("failing hot ranges job so that it is restarted"))
4856
}
4957

5058
func (j *hotRangesLoggingJob) OnFailOrCancel(
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package structlogging_test
7+
8+
import (
9+
"context"
10+
"testing"
11+
12+
"github.com/cockroachdb/cockroach/pkg/base"
13+
"github.com/cockroachdb/cockroach/pkg/jobs"
14+
"github.com/cockroachdb/cockroach/pkg/testutils"
15+
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
16+
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
17+
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
18+
"github.com/cockroachdb/cockroach/pkg/util/log"
19+
"github.com/cockroachdb/errors"
20+
"github.com/stretchr/testify/require"
21+
)
22+
23+
// This test verifies that the hot ranges logging job starts for both
24+
// the system and app layers, and exits quickly for the system layer.
25+
func TestHotRangesLoggingJobExitProcedure(t *testing.T) {
26+
defer leaktest.AfterTest(t)()
27+
defer log.Scope(t).Close(t)
28+
skip.UnderStress(t)
29+
skip.UnderRace(t)
30+
31+
ctx := context.Background()
32+
ts := serverutils.StartServerOnly(t, base.TestServerArgs{
33+
Knobs: base.TestingKnobs{
34+
JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(),
35+
},
36+
})
37+
defer ts.Stopper().Stop(ctx)
38+
39+
syslayer := ts.SystemLayer().SQLConn(t)
40+
applayer := ts.ApplicationLayer().SQLConn(t)
41+
42+
testutils.SucceedsSoon(t, func() error {
43+
row := syslayer.QueryRow("SELECT status FROM system.public.jobs WHERE id = $1", jobs.HotRangesLoggerJobID)
44+
var status string
45+
require.NoError(t, row.Scan(&status))
46+
if status != "succeeded" {
47+
return errors.Newf("system job status is %s, not succeeded", status)
48+
}
49+
50+
row = applayer.QueryRow("SELECT status FROM system.public.jobs WHERE id = $1", jobs.HotRangesLoggerJobID)
51+
require.NoError(t, row.Scan(&status))
52+
if status != "running" {
53+
return errors.Newf("app job status is %s, not running", status)
54+
}
55+
return nil
56+
})
57+
}

0 commit comments

Comments
 (0)