From f36dc98e5ddeaedab65b26ac1754224f168cab18 Mon Sep 17 00:00:00 2001 From: Michael Butler Date: Mon, 22 Dec 2025 15:05:15 -0500 Subject: [PATCH] jobs: time out claim jobs query after 1 minute by default The claim query should take less than a second, but we have seen it hang for multiple hours because the underlying transaction continuously retries and deadlocks. To prevent this, this patch causes the claim query to time out after one minute by default; the timeout is controlled by the private cluster setting jobs.registry.claim_query.timeout. The per-node claim loop will try again in the next iteration. Informs #158976 Release note: none --- pkg/jobs/adopt.go | 5 +++++ pkg/jobs/config.go | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/pkg/jobs/adopt.go b/pkg/jobs/adopt.go index f234fc213404..671fe949acf0 100644 --- a/pkg/jobs/adopt.go +++ b/pkg/jobs/adopt.go @@ -95,6 +95,11 @@ func (r *Registry) maybeDumpTrace(resumerCtx context.Context, resumer Resumer, j // claimJobs places a claim with the given SessionID to job rows that are // available. func (r *Registry) claimJobs(ctx context.Context, s sqlliveness.Session) error { + + timeout := claimQueryTimeout.Get(&r.settings.SV) + ctx, cancel := context.WithDeadlineCause(ctx, timeutil.Now().Add(timeout), errors.New("claim jobs transaction took too long")) + defer cancel() + return r.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { // Run the claim transaction at low priority to ensure that it does not // contend with foreground reads. 
diff --git a/pkg/jobs/config.go b/pkg/jobs/config.go index 5a07d5dc3e5d..732bcb06182c 100644 --- a/pkg/jobs/config.go +++ b/pkg/jobs/config.go @@ -24,6 +24,7 @@ const ( cancelUpdateLimitKey = "jobs.cancel_update_limit" debugPausePointsSettingKey = "jobs.debug.pausepoints" metricsPollingIntervalKey = "jobs.metrics.interval.poll" + claimQueryTimeoutKey = "jobs.registry.claim_query.timeout" ) const ( @@ -134,6 +135,14 @@ var ( "the list, comma separated, of named pausepoints currently enabled for debugging", "", ) + + claimQueryTimeout = settings.RegisterDurationSetting( + settings.ApplicationLevel, + claimQueryTimeoutKey, + "the timeout for the claim query used when adopting jobs", + time.Minute, + settings.PositiveDuration, + ) ) // jitter adds a small jitter in the given duration.