Skip to content

Commit 755545b

Browse files
craig[bot] and spilchen committed
Merge #152618
152618: ttl: add checkpointing to TTL job r=spilchen a=spilchen Introduce a progress tracker for TTL jobs that supports checkpointing. Previously, if a TTL job restarted, it would reprocess spans that had already been completed. With this change, completed spans are stored in the job record so that restarts can skip over them, avoiding duplicate work. Fixes #140514 Epic: none Release note (performance improvement): TTL jobs now checkpoint their progress, allowing them to resume without reprocessing already completed spans after a restart. Co-authored-by: Matt Spilchen <[email protected]>
2 parents e5ebcac + 7fcbd48 commit 755545b

File tree

11 files changed

+1005
-37
lines changed

11 files changed

+1005
-37
lines changed

pkg/jobs/jobspb/jobs.proto

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,8 +1321,10 @@ message RowLevelTTLDetails {
13211321
}
13221322

13231323
message RowLevelTTLProgress {
1324-
1325-
// JobDeletedRowCount is the number of rows deleted by TTL job so far.
1324+
// JobDeletedRowCount is the number of rows deleted by TTL job so far. If
1325+
// UseCheckpointing is true, then this is the deleted row count across all
1326+
// restarts of the job. If that is false, this is the deleted row count in
1327+
// the current run of the job.
13261328
int64 job_deleted_row_count = 1;
13271329

13281330
// ProcessorProgresses is the progress per DistSQL processor.
@@ -1331,12 +1333,18 @@ message RowLevelTTLProgress {
13311333
// UseDistSQL is no longer used in v23.1+ as all TTL jobs are using DistSQL.
13321334
reserved 3;
13331335

1334-
// JobTotalSpanCount is the number of spans for the entire TTL job.
1335-
int64 job_total_span_count = 4;
1336+
// JobTotalSpanCount is the number of spans for the entire TTL job. This is
1337+
// deprecated and not used if UseCheckpointing is true.
1338+
int64 job_total_span_count = 4 [deprecated = true];
13361339

13371340
// JobProcessedSpanCount is the number of spans that have been processed by
1338-
// the TTL job so far.
1339-
int64 job_processed_span_count = 5;
1341+
// the TTL job so far. This is deprecated and not used if UseCheckpointing is
1342+
// true.
1343+
int64 job_processed_span_count = 5 [deprecated = true];
1344+
1345+
// UseCheckpointing is true if the TTL job will use checkpointing for progress
1346+
// tracking.
1347+
bool use_checkpointing = 6;
13401348
}
13411349

13421350
message RowLevelTTLProcessorProgress {
@@ -1354,11 +1362,13 @@ message RowLevelTTLProcessorProgress {
13541362
// DeletedRowCount is the number of rows deleted by this DistSQL processor.
13551363
int64 deleted_row_count = 3;
13561364

1357-
// TotalSpanCount is the total number of spans assigned to the DistSQL processor.
1358-
int64 total_span_count = 4;
1365+
// TotalSpanCount is the total number of spans assigned to the DistSQL
1366+
// processor. This is deprecated and not used if using checkpointing.
1367+
int64 total_span_count = 4 [deprecated = true];
13591368

1360-
// ProcessedSpanCount is the number of spans already processed.
1361-
int64 processed_span_count = 6;
1369+
// ProcessedSpanCount is the number of spans already processed. This is
1370+
// deprecated when using checkpointing.
1371+
int64 processed_span_count = 6 [deprecated = true];
13621372

13631373
// ProcessorConcurrency is the number of parallel tasks the processor will do at once.
13641374
int64 processor_concurrency = 5;

pkg/sql/spanutils/query_bounds.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ type QueryBounds struct {
4040
// span's end key is exclusive because the end bounds are based on the first
4141
// row < Span.EndKey.
4242
End tree.Datums
43+
// Span is the original span that these query bounds were derived from.
44+
// This preserves the source span information for reference.
45+
Span roachpb.Span
4346
}
4447

4548
var (
@@ -110,6 +113,7 @@ func SpanToQueryBounds(
110113
if err != nil {
111114
return bounds, false, errors.Wrapf(err, "decode endKeyValues error on %+v", endKeyValues)
112115
}
116+
bounds.Span = span
113117
return bounds, true, nil
114118
}
115119

pkg/sql/ttl/ttljob/BUILD.bazel

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ go_library(
1515
"//pkg/base",
1616
"//pkg/jobs",
1717
"//pkg/jobs/joberror",
18+
"//pkg/jobs/jobfrontier",
1819
"//pkg/jobs/jobspb",
1920
"//pkg/kv",
2021
"//pkg/roachpb",
@@ -53,12 +54,14 @@ go_library(
5354
"//pkg/util/protoutil",
5455
"//pkg/util/quotapool",
5556
"//pkg/util/retry",
57+
"//pkg/util/span",
5658
"//pkg/util/syncutil",
5759
"//pkg/util/timeutil",
5860
"@com_github_cockroachdb_errors//:errors",
5961
"@com_github_cockroachdb_redact//:redact",
6062
"@com_github_gogo_protobuf//types",
6163
"@com_github_prometheus_client_model//go",
64+
"@org_golang_x_sync//errgroup",
6265
],
6366
)
6467

@@ -83,6 +86,7 @@ go_test(
8386
"//pkg/ccl/kvccl/kvtenantccl",
8487
"//pkg/clusterversion",
8588
"//pkg/jobs",
89+
"//pkg/jobs/jobfrontier",
8690
"//pkg/jobs/jobspb",
8791
"//pkg/jobs/jobstest",
8892
"//pkg/keys",

0 commit comments

Comments
 (0)