Skip to content

Commit 17fae06

Browse files
committed
backup: create backup index file on backup completion
This commit teaches backup to write a backup index file upon completion of the backup as outlined in the [design doc](https://docs.google.com/document/d/1zXGQ7c9jvKmatO64jVKNrO5BBODAE38y-TqNTeb81yA). Epic: CRDB-47942 Fixes: #150326 Release note: None
1 parent a685bfb commit 17fae06

File tree

12 files changed

+637
-30
lines changed

12 files changed

+637
-30
lines changed

pkg/backup/backup_job.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -126,19 +126,17 @@ func filterSpans(includes []roachpb.Span, excludes []roachpb.Span) []roachpb.Spa
126126
func backup(
127127
ctx context.Context,
128128
execCtx sql.JobExecContext,
129-
defaultURI string,
130-
urisByLocalityKV map[string]string,
129+
details jobspb.BackupDetails,
131130
settings *cluster.Settings,
132131
defaultStore cloud.ExternalStorage,
133132
storageByLocalityKV map[string]*cloudpb.ExternalStorage,
134133
resumer *backupResumer,
135134
backupManifest *backuppb.BackupManifest,
136135
makeExternalStorage cloud.ExternalStorageFactory,
137-
encryption *jobspb.BackupEncryptionOptions,
138-
execLocality roachpb.Locality,
139136
) (_ roachpb.RowCount, numBackupInstances int, _ error) {
140137
resumerSpan := tracing.SpanFromContext(ctx)
141138
var lastCheckpoint time.Time
139+
encryption := details.EncryptionOptions
142140

143141
kmsEnv := backupencryption.MakeBackupKMSEnv(
144142
execCtx.ExecCfg().Settings,
@@ -170,14 +168,16 @@ func backup(
170168
oracle := physicalplan.DefaultReplicaChooser
171169
if useBulkOracle.Get(&evalCtx.Settings.SV) {
172170
oracle = kvfollowerreadsccl.NewBulkOracle(
173-
dsp.ReplicaOracleConfig(evalCtx.Locality), execLocality, kvfollowerreadsccl.StreakConfig{},
171+
dsp.ReplicaOracleConfig(evalCtx.Locality),
172+
details.ExecutionLocality,
173+
kvfollowerreadsccl.StreakConfig{},
174174
)
175175
}
176176

177177
// We don't return the compatible nodes here since PartitionSpans will
178178
// filter out incompatible nodes.
179179
planCtx, _, err := dsp.SetupAllNodesPlanningWithOracle(
180-
ctx, evalCtx, execCtx.ExecCfg(), oracle, execLocality,
180+
ctx, evalCtx, execCtx.ExecCfg(), oracle, details.ExecutionLocality,
181181
)
182182
if err != nil {
183183
return roachpb.RowCount{}, 0, errors.Wrap(err, "failed to determine nodes on which to run")
@@ -193,8 +193,8 @@ func backup(
193193
spans,
194194
introducedSpans,
195195
pkIDs,
196-
defaultURI,
197-
urisByLocalityKV,
196+
details.URI,
197+
details.URIsByLocalityKV,
198198
encryption,
199199
&kmsEnv,
200200
kvpb.MVCCFilter(backupManifest.MVCCFilter),
@@ -302,7 +302,7 @@ func backup(
302302
})
303303

304304
err := backupinfo.WriteBackupManifestCheckpoint(
305-
ctx, defaultURI, encryption, &kmsEnv, backupManifest, execCtx.ExecCfg(), execCtx.User(),
305+
ctx, details.URI, encryption, &kmsEnv, backupManifest, execCtx.ExecCfg(), execCtx.User(),
306306
)
307307
if err != nil {
308308
log.Errorf(ctx, "unable to checkpoint backup descriptor: %+v", err)
@@ -439,6 +439,15 @@ func backup(
439439
return roachpb.RowCount{}, 0, err
440440
}
441441

442+
if err := backupdest.WriteBackupIndexMetadata(
443+
ctx,
444+
execCtx.User(),
445+
execCtx.ExecCfg().DistSQLSrv.ExternalStorageFromURI,
446+
details,
447+
); err != nil {
448+
return roachpb.RowCount{}, 0, errors.Wrapf(err, "writing backup index metadata")
449+
}
450+
442451
return backupManifest.EntryCounts, numBackupInstances, nil
443452
}
444453

@@ -723,16 +732,13 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
723732
res, numBackupInstances, err = backup(
724733
ctx,
725734
p,
726-
details.URI,
727-
details.URIsByLocalityKV,
735+
details,
728736
p.ExecCfg().Settings,
729737
defaultStore,
730738
storageByLocalityKV,
731739
b,
732740
backupManifest,
733741
p.ExecCfg().DistSQLSrv.ExternalStorage,
734-
details.EncryptionOptions,
735-
details.ExecutionLocality,
736742
)
737743
if err == nil {
738744
break

pkg/backup/backup_test.go

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1708,6 +1708,10 @@ func TestBackupRestoreResume(t *testing.T) {
17081708
defer leaktest.AfterTest(t)()
17091709
defer log.Scope(t).Close(t)
17101710

1711+
// This test flakes in TC CI, but not under Engflow, so we're just skipping it
1712+
// under race.
1713+
skip.UnderRace(t, "flaky under race after #150384")
1714+
17111715
ctx := context.Background()
17121716

17131717
params := base.TestClusterArgs{ServerArgs: base.TestServerArgs{
@@ -1757,11 +1761,13 @@ func TestBackupRestoreResume(t *testing.T) {
17571761
if err := os.WriteFile(checkpointFile, mockManifest, 0644); err != nil {
17581762
t.Fatal(err)
17591763
}
1764+
uri := "nodelocal://1/backup" + "-" + item.testName
17601765
createAndWaitForJob(
17611766
t, sqlDB, []descpb.ID{backupTableDesc.GetID()},
17621767
jobspb.BackupDetails{
1763-
EndTime: srv.Clock().Now(),
1764-
URI: "nodelocal://1/backup" + "-" + item.testName,
1768+
EndTime: srv.Clock().Now(),
1769+
CollectionURI: uri,
1770+
URI: uri,
17651771
},
17661772
jobspb.BackupProgress{},
17671773
roachpb.Version{},
@@ -11241,3 +11247,65 @@ func TestRestoreFailureDeletesComments(t *testing.T) {
1124111247
sqlDB.QueryRow(t, commentCountQuery).Scan(&count)
1124211248
require.Equal(t, 0, count)
1124311249
}
11250+
11251+
func TestBackupIndexCreatedAfterBackup(t *testing.T) {
11252+
defer leaktest.AfterTest(t)()
11253+
defer log.Scope(t).Close(t)
11254+
11255+
ctx := context.Background()
11256+
th, cleanup := newTestHelper(t)
11257+
defer cleanup()
11258+
11259+
th.setOverrideAsOfClauseKnob(t)
11260+
// Time is set to a time such that no full backup will unexpectedly run as we
11261+
// artificially time travel. This ensures deterministic behavior that is not
11262+
// impacted by when the test runs.
11263+
th.env.SetTime(time.Date(2025, 7, 18, 0, 0, 0, 0, time.UTC))
11264+
11265+
th.sqlDB.Exec(t, "SET CLUSTER SETTING backup.compaction.threshold = 4")
11266+
th.sqlDB.Exec(t, "SET CLUSTER SETTING backup.compaction.window_size = 3")
11267+
11268+
schedules, err := th.createBackupSchedule(
11269+
t, "CREATE SCHEDULE FOR BACKUP INTO $1 RECURRING '@hourly'", "nodelocal://1/backup",
11270+
)
11271+
require.NoError(t, err)
11272+
require.Equal(t, 2, len(schedules))
11273+
11274+
full, inc := schedules[0], schedules[1]
11275+
if full.IsPaused() {
11276+
full, inc = inc, full
11277+
}
11278+
11279+
th.env.SetTime(full.NextRun().Add(time.Second))
11280+
require.NoError(t, th.executeSchedules())
11281+
th.waitForSuccessfulScheduledJob(t, full.ScheduleID())
11282+
11283+
var backupPath string
11284+
th.sqlDB.QueryRow(t, "SHOW BACKUPS IN 'nodelocal://1/backup'").Scan(&backupPath)
11285+
11286+
for range 3 {
11287+
inc, err = jobs.ScheduledJobDB(th.internalDB()).Load(ctx, th.env, inc.ScheduleID())
11288+
require.NoError(t, err)
11289+
11290+
th.env.SetTime(inc.NextRun().Add(time.Second))
11291+
require.NoError(t, th.executeSchedules())
11292+
th.waitForSuccessfulScheduledJob(t, inc.ScheduleID())
11293+
}
11294+
var compactionJob jobspb.JobID
11295+
require.NoError(
11296+
t, th.sqlDB.DB.QueryRowContext(
11297+
ctx,
11298+
`SELECT job_id FROM [SHOW JOBS] WHERE description ILIKE 'COMPACT%' AND job_type = 'BACKUP'`,
11299+
).Scan(&compactionJob),
11300+
)
11301+
jobutils.WaitForJobToSucceed(t, th.sqlDB, compactionJob)
11302+
11303+
fullIndexes, err := os.ReadDir(path.Join(th.iodir, "backup", backupbase.BackupIndexDirectoryPath))
11304+
require.NoError(t, err)
11305+
require.Len(t, fullIndexes, 1)
11306+
files, err := os.ReadDir(
11307+
path.Join(th.iodir, "backup", backupbase.BackupIndexDirectoryPath, fullIndexes[0].Name()),
11308+
)
11309+
require.NoError(t, err)
11310+
require.Len(t, files, 5)
11311+
}

pkg/backup/backupbase/constants.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,12 @@ const (
6363
// and groups all the data sst files in each backup, which start with "data/",
6464
// into a single result that can be skipped over quickly.
6565
ListingDelimDataSlash = "data/"
66+
67+
// BackupIndexDirectoryName is the path from the root of the backup collection
68+
// to the directory containing the index files for the backup collection.
69+
BackupIndexDirectoryPath = "index/"
70+
71+
// BackupIndexFilenameTimestampFormat is the format used for the human
72+
// readable start and end times in the index file names.
73+
BackupIndexFilenameTimestampFormat = "20060102-150405.00"
6674
)

pkg/backup/backupdest/BUILD.bazel

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go_library(
44
name = "backupdest",
55
srcs = [
66
"backup_destination.go",
7+
"backup_index.go",
78
"incrementals.go",
89
],
910
importpath = "github.com/cockroachdb/cockroach/pkg/backup/backupdest",
@@ -29,6 +30,7 @@ go_library(
2930
"//pkg/util/hlc",
3031
"//pkg/util/ioctx",
3132
"//pkg/util/mon",
33+
"//pkg/util/protoutil",
3234
"//pkg/util/timeutil",
3335
"//pkg/util/tracing",
3436
"@com_github_cockroachdb_errors//:errors",
@@ -39,36 +41,42 @@ go_test(
3941
name = "backupdest_test",
4042
srcs = [
4143
"backup_destination_test.go",
44+
"backup_index_test.go",
4245
"incrementals_test.go",
4346
"main_test.go",
4447
],
48+
embed = [":backupdest"],
4549
exec_properties = select({
4650
"//build/toolchains:is_heavy": {"test.Pool": "large"},
4751
"//conditions:default": {"test.Pool": "default"},
4852
}),
4953
deps = [
50-
":backupdest",
5154
"//pkg/backup/backupbase",
5255
"//pkg/backup/backuppb",
5356
"//pkg/backup/backuptestutils",
5457
"//pkg/backup/backuputils",
5558
"//pkg/base",
5659
"//pkg/ccl",
5760
"//pkg/cloud",
61+
"//pkg/cloud/cloudpb",
5862
"//pkg/cloud/impl:cloudimpl",
5963
"//pkg/jobs/jobspb",
6064
"//pkg/security/securityassets",
6165
"//pkg/security/securitytest",
6266
"//pkg/security/username",
6367
"//pkg/server",
6468
"//pkg/sql",
69+
"//pkg/testutils",
6570
"//pkg/testutils/serverutils",
6671
"//pkg/testutils/testcluster",
72+
"//pkg/util",
6773
"//pkg/util/hlc",
74+
"//pkg/util/ioctx",
6875
"//pkg/util/leaktest",
6976
"//pkg/util/log",
7077
"//pkg/util/protoutil",
7178
"//pkg/util/randutil",
79+
"@com_github_cockroachdb_errors//:errors",
7280
"@com_github_stretchr_testify//require",
7381
],
7482
)
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package backupdest
7+
8+
import (
9+
"bytes"
10+
"context"
11+
"fmt"
12+
"strings"
13+
14+
"github.com/cockroachdb/cockroach/pkg/backup/backupbase"
15+
"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
16+
"github.com/cockroachdb/cockroach/pkg/backup/backuputils"
17+
"github.com/cockroachdb/cockroach/pkg/cloud"
18+
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
19+
"github.com/cockroachdb/cockroach/pkg/security/username"
20+
"github.com/cockroachdb/cockroach/pkg/util/hlc"
21+
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
22+
"github.com/cockroachdb/cockroach/pkg/util/tracing"
23+
"github.com/cockroachdb/errors"
24+
)
25+
26+
// WriteBackupIndexMetadata writes an index file for the backup described by the
27+
// job details. The provided ExternalStorage needs to be rooted at the specific
28+
// directory that the index file should be written to.
29+
//
30+
// Note: This file is not encrypted, so it should not contain any sensitive
31+
// information.
32+
func WriteBackupIndexMetadata(
33+
ctx context.Context,
34+
user username.SQLUsername,
35+
makeExternalStorageFromURI cloud.ExternalStorageFromURIFactory,
36+
details jobspb.BackupDetails,
37+
) error {
38+
ctx, sp := tracing.ChildSpan(ctx, "backupdest.WriteBackupIndexMetadata")
39+
defer sp.Finish()
40+
41+
if details.EndTime.IsEmpty() {
42+
return errors.AssertionFailedf("end time must be set in backup details")
43+
}
44+
if details.Destination.Exists && details.StartTime.IsEmpty() {
45+
return errors.AssertionFailedf("incremental backup details missing a start time")
46+
}
47+
48+
var backupCollectionURI string
49+
// Find the root of the collection URI that the backup is being written to so
50+
// that we can determine the relative path of the backup.
51+
if details.StartTime.IsEmpty() {
52+
backupCollectionURI = details.CollectionURI
53+
} else {
54+
var err error
55+
backupCollectionURI, err = ResolveDefaultBaseIncrementalStorageLocation(
56+
details.Destination.To, details.Destination.IncrementalStorage,
57+
)
58+
if err != nil {
59+
return errors.Wrapf(err, "get incremental backup collection URI")
60+
}
61+
}
62+
63+
path, err := backuputils.RelativeBackupPathInCollectionURI(backupCollectionURI, details.URI)
64+
if err != nil {
65+
return errors.Wrapf(err, "get relative backup path")
66+
}
67+
metadata := &backuppb.BackupIndexMetadata{
68+
StartTime: details.StartTime,
69+
EndTime: details.EndTime,
70+
Path: path,
71+
}
72+
metadataBytes, err := protoutil.Marshal(metadata)
73+
if err != nil {
74+
return errors.Wrapf(err, "marshal backup index metadata")
75+
}
76+
77+
indexStore, err := makeExternalStorageFromURI(
78+
ctx, details.CollectionURI, user,
79+
)
80+
if err != nil {
81+
return errors.Wrapf(err, "creating external storage")
82+
}
83+
84+
indexFilePath, err := getBackupIndexFilePath(
85+
details.Destination.Subdir,
86+
details.StartTime,
87+
details.EndTime,
88+
)
89+
if err != nil {
90+
return errors.Wrapf(err, "getting index file path")
91+
}
92+
93+
return cloud.WriteFile(
94+
ctx, indexStore, indexFilePath, bytes.NewReader(metadataBytes),
95+
)
96+
}
97+
98+
// getBackupIndexFilePath returns the path to the backup index file representing
99+
// a backup that starts and ends at the given timestamps, including
100+
// the filename and extension. The path is relative to the collection URI.
101+
func getBackupIndexFilePath(subdir string, startTime, endTime hlc.Timestamp) (string, error) {
102+
if strings.EqualFold(subdir, backupbase.LatestFileName) {
103+
return "", errors.AssertionFailedf("expected subdir to be resolved and not be 'LATEST'")
104+
}
105+
// We flatten the subdir so that when listing from the index, we can list with
106+
// the `index/` prefix and delimit on `/`.
107+
flattenedSubdir := strings.ReplaceAll(
108+
strings.TrimPrefix(subdir, "/"),
109+
"/", "-",
110+
)
111+
return backuputils.JoinURLPath(
112+
backupbase.BackupIndexDirectoryPath,
113+
flattenedSubdir,
114+
getBackupIndexFileName(startTime, endTime),
115+
), nil
116+
}
117+
118+
// getBackupIndexFilename generates the filename (including the extension) for a
119+
// backup index file that represents a backup that starts ad ends at the given
120+
// timestamps.
121+
func getBackupIndexFileName(startTime, endTime hlc.Timestamp) string {
122+
descEndTs := backuputils.EncodeDescendingTS(endTime.GoTime())
123+
formattedStartTime := startTime.GoTime().Format(backupbase.BackupIndexFilenameTimestampFormat)
124+
if startTime.IsEmpty() {
125+
formattedStartTime = "0" // Use a placeholder for empty start time.
126+
}
127+
formattedEndTime := endTime.GoTime().Format(backupbase.BackupIndexFilenameTimestampFormat)
128+
return fmt.Sprintf(
129+
"%s_%s_%s_metadata.pb",
130+
descEndTs, formattedStartTime, formattedEndTime,
131+
)
132+
}

0 commit comments

Comments
 (0)