Skip to content

Commit debd13e

Browse files
craig[bot]kev-caowenyihu6
committed
150384: backup: create backup index file on backup completion r=dt a=kev-cao This commit teaches backup to write a backup index file upon completion of the backup as outlined in the [design doc](https://docs.google.com/document/d/1zXGQ7c9jvKmatO64jVKNrO5BBODAE38y-TqNTeb81yA). Epic: CRDB-47942 Fixes: #150326 Release note: None 151054: util: create MapE utility function r=dt a=kev-cao Epic: None Release note: None 151055: asim: fix skew distribution r=tbg a=wenyihu6 **roachprod: ibm delete clusters** Starting in #150873, the IBM API query to list instances for deletion got more complex and this made the previously working shorthand `tags:"k1:v1 AND k2:v2"` ineffective. This patches the query to the fully written format `(tags:"k1:v1" AND tags:"k2:v2")`. Epic: none Release note: None --- **Merge #151319** 151319: roachprod: ibm delete clusters r=herkolategan a=golgeek Starting in #150873, the IBM API query to list instances for deletion got more complex and this made the previously working shorthand `tags:"k1:v1 AND k2:v2"` ineffective. This patches the query to the fully written format `(tags:"k1:v1" AND tags:"k2:v2")`. Epic: none Release note: None Co-authored-by: Ludovic Leroux <[email protected]> --- **asim: fix skew distribution** Previously, skewedDistribution did not correctly compute and return the ratio based distribution. This commit fixes the logic to return the ratio. Epic: none Release note: none --- **asim: add TestEvenDistribution** --- **asim: rename arguments** This commit renames parameters for distribution helper functions in pkg/kv/kvserver/asim/state/new_state.go. --- **asim/state: use epsilon in cumulative weights checking** Previously, we asserted that cumulative weights in the weighted random distribution must sum to exactly 1. Due to floating point precision limitations, this is not always achievable and can lead to false failures like total cumulative weights for all stores should sum up to 1 but got 1.00. This commit relaxes the check to allow a small epsilon tolerance in the comparison to avoid such errors. --- **asim: improve skewedDistribution and its testing** Previously, we fixed the skewedDistribution function by generating weights that decrease by a factor of 1/2 for each subsequent store, and then normalizing them so that they sum up to 1. The result represented a skewed replica weight distribution across stores. However, this required two passes: one to generate and sum the weights, and another to normalize them. This commit improves the logic by using the finite sum of a geometric series to pre-compute the total, allowing normalization in a single pass. It also improves test coverage by adding an echotest that asserts the expected output of the helper functions. Co-authored-by: Kevin Cao <[email protected]> Co-authored-by: wenyihu6 <[email protected]>
4 parents 50fb4c1 + 17fae06 + 3ed6d21 + cb85168 commit debd13e

32 files changed

+897
-129
lines changed

pkg/backup/backup_job.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -126,19 +126,17 @@ func filterSpans(includes []roachpb.Span, excludes []roachpb.Span) []roachpb.Spa
126126
func backup(
127127
ctx context.Context,
128128
execCtx sql.JobExecContext,
129-
defaultURI string,
130-
urisByLocalityKV map[string]string,
129+
details jobspb.BackupDetails,
131130
settings *cluster.Settings,
132131
defaultStore cloud.ExternalStorage,
133132
storageByLocalityKV map[string]*cloudpb.ExternalStorage,
134133
resumer *backupResumer,
135134
backupManifest *backuppb.BackupManifest,
136135
makeExternalStorage cloud.ExternalStorageFactory,
137-
encryption *jobspb.BackupEncryptionOptions,
138-
execLocality roachpb.Locality,
139136
) (_ roachpb.RowCount, numBackupInstances int, _ error) {
140137
resumerSpan := tracing.SpanFromContext(ctx)
141138
var lastCheckpoint time.Time
139+
encryption := details.EncryptionOptions
142140

143141
kmsEnv := backupencryption.MakeBackupKMSEnv(
144142
execCtx.ExecCfg().Settings,
@@ -170,14 +168,16 @@ func backup(
170168
oracle := physicalplan.DefaultReplicaChooser
171169
if useBulkOracle.Get(&evalCtx.Settings.SV) {
172170
oracle = kvfollowerreadsccl.NewBulkOracle(
173-
dsp.ReplicaOracleConfig(evalCtx.Locality), execLocality, kvfollowerreadsccl.StreakConfig{},
171+
dsp.ReplicaOracleConfig(evalCtx.Locality),
172+
details.ExecutionLocality,
173+
kvfollowerreadsccl.StreakConfig{},
174174
)
175175
}
176176

177177
// We don't return the compatible nodes here since PartitionSpans will
178178
// filter out incompatible nodes.
179179
planCtx, _, err := dsp.SetupAllNodesPlanningWithOracle(
180-
ctx, evalCtx, execCtx.ExecCfg(), oracle, execLocality,
180+
ctx, evalCtx, execCtx.ExecCfg(), oracle, details.ExecutionLocality,
181181
)
182182
if err != nil {
183183
return roachpb.RowCount{}, 0, errors.Wrap(err, "failed to determine nodes on which to run")
@@ -193,8 +193,8 @@ func backup(
193193
spans,
194194
introducedSpans,
195195
pkIDs,
196-
defaultURI,
197-
urisByLocalityKV,
196+
details.URI,
197+
details.URIsByLocalityKV,
198198
encryption,
199199
&kmsEnv,
200200
kvpb.MVCCFilter(backupManifest.MVCCFilter),
@@ -302,7 +302,7 @@ func backup(
302302
})
303303

304304
err := backupinfo.WriteBackupManifestCheckpoint(
305-
ctx, defaultURI, encryption, &kmsEnv, backupManifest, execCtx.ExecCfg(), execCtx.User(),
305+
ctx, details.URI, encryption, &kmsEnv, backupManifest, execCtx.ExecCfg(), execCtx.User(),
306306
)
307307
if err != nil {
308308
log.Errorf(ctx, "unable to checkpoint backup descriptor: %+v", err)
@@ -439,6 +439,15 @@ func backup(
439439
return roachpb.RowCount{}, 0, err
440440
}
441441

442+
if err := backupdest.WriteBackupIndexMetadata(
443+
ctx,
444+
execCtx.User(),
445+
execCtx.ExecCfg().DistSQLSrv.ExternalStorageFromURI,
446+
details,
447+
); err != nil {
448+
return roachpb.RowCount{}, 0, errors.Wrapf(err, "writing backup index metadata")
449+
}
450+
442451
return backupManifest.EntryCounts, numBackupInstances, nil
443452
}
444453

@@ -723,16 +732,13 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
723732
res, numBackupInstances, err = backup(
724733
ctx,
725734
p,
726-
details.URI,
727-
details.URIsByLocalityKV,
735+
details,
728736
p.ExecCfg().Settings,
729737
defaultStore,
730738
storageByLocalityKV,
731739
b,
732740
backupManifest,
733741
p.ExecCfg().DistSQLSrv.ExternalStorage,
734-
details.EncryptionOptions,
735-
details.ExecutionLocality,
736742
)
737743
if err == nil {
738744
break

pkg/backup/backup_test.go

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1708,6 +1708,10 @@ func TestBackupRestoreResume(t *testing.T) {
17081708
defer leaktest.AfterTest(t)()
17091709
defer log.Scope(t).Close(t)
17101710

1711+
// This test flakes in TC CI, but not under Engflow, so we're just skipping it
1712+
// under race.
1713+
skip.UnderRace(t, "flaky under race after #150384")
1714+
17111715
ctx := context.Background()
17121716

17131717
params := base.TestClusterArgs{ServerArgs: base.TestServerArgs{
@@ -1757,11 +1761,13 @@ func TestBackupRestoreResume(t *testing.T) {
17571761
if err := os.WriteFile(checkpointFile, mockManifest, 0644); err != nil {
17581762
t.Fatal(err)
17591763
}
1764+
uri := "nodelocal://1/backup" + "-" + item.testName
17601765
createAndWaitForJob(
17611766
t, sqlDB, []descpb.ID{backupTableDesc.GetID()},
17621767
jobspb.BackupDetails{
1763-
EndTime: srv.Clock().Now(),
1764-
URI: "nodelocal://1/backup" + "-" + item.testName,
1768+
EndTime: srv.Clock().Now(),
1769+
CollectionURI: uri,
1770+
URI: uri,
17651771
},
17661772
jobspb.BackupProgress{},
17671773
roachpb.Version{},
@@ -11271,3 +11277,65 @@ func TestRestoreFailureDeletesComments(t *testing.T) {
1127111277
sqlDB.QueryRow(t, commentCountQuery).Scan(&count)
1127211278
require.Equal(t, 0, count)
1127311279
}
11280+
11281+
func TestBackupIndexCreatedAfterBackup(t *testing.T) {
11282+
defer leaktest.AfterTest(t)()
11283+
defer log.Scope(t).Close(t)
11284+
11285+
ctx := context.Background()
11286+
th, cleanup := newTestHelper(t)
11287+
defer cleanup()
11288+
11289+
th.setOverrideAsOfClauseKnob(t)
11290+
// Time is set to a time such that no full backup will unexpectedly run as we
11291+
// artificially time travel. This ensures deterministic behavior that is not
11292+
// impacted by when the test runs.
11293+
th.env.SetTime(time.Date(2025, 7, 18, 0, 0, 0, 0, time.UTC))
11294+
11295+
th.sqlDB.Exec(t, "SET CLUSTER SETTING backup.compaction.threshold = 4")
11296+
th.sqlDB.Exec(t, "SET CLUSTER SETTING backup.compaction.window_size = 3")
11297+
11298+
schedules, err := th.createBackupSchedule(
11299+
t, "CREATE SCHEDULE FOR BACKUP INTO $1 RECURRING '@hourly'", "nodelocal://1/backup",
11300+
)
11301+
require.NoError(t, err)
11302+
require.Equal(t, 2, len(schedules))
11303+
11304+
full, inc := schedules[0], schedules[1]
11305+
if full.IsPaused() {
11306+
full, inc = inc, full
11307+
}
11308+
11309+
th.env.SetTime(full.NextRun().Add(time.Second))
11310+
require.NoError(t, th.executeSchedules())
11311+
th.waitForSuccessfulScheduledJob(t, full.ScheduleID())
11312+
11313+
var backupPath string
11314+
th.sqlDB.QueryRow(t, "SHOW BACKUPS IN 'nodelocal://1/backup'").Scan(&backupPath)
11315+
11316+
for range 3 {
11317+
inc, err = jobs.ScheduledJobDB(th.internalDB()).Load(ctx, th.env, inc.ScheduleID())
11318+
require.NoError(t, err)
11319+
11320+
th.env.SetTime(inc.NextRun().Add(time.Second))
11321+
require.NoError(t, th.executeSchedules())
11322+
th.waitForSuccessfulScheduledJob(t, inc.ScheduleID())
11323+
}
11324+
var compactionJob jobspb.JobID
11325+
require.NoError(
11326+
t, th.sqlDB.DB.QueryRowContext(
11327+
ctx,
11328+
`SELECT job_id FROM [SHOW JOBS] WHERE description ILIKE 'COMPACT%' AND job_type = 'BACKUP'`,
11329+
).Scan(&compactionJob),
11330+
)
11331+
jobutils.WaitForJobToSucceed(t, th.sqlDB, compactionJob)
11332+
11333+
fullIndexes, err := os.ReadDir(path.Join(th.iodir, "backup", backupbase.BackupIndexDirectoryPath))
11334+
require.NoError(t, err)
11335+
require.Len(t, fullIndexes, 1)
11336+
files, err := os.ReadDir(
11337+
path.Join(th.iodir, "backup", backupbase.BackupIndexDirectoryPath, fullIndexes[0].Name()),
11338+
)
11339+
require.NoError(t, err)
11340+
require.Len(t, files, 5)
11341+
}

pkg/backup/backupbase/constants.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,12 @@ const (
6363
// and groups all the data sst files in each backup, which start with "data/",
6464
// into a single result that can be skipped over quickly.
6565
ListingDelimDataSlash = "data/"
66+
67+
// BackupIndexDirectoryName is the path from the root of the backup collection
68+
// to the directory containing the index files for the backup collection.
69+
BackupIndexDirectoryPath = "index/"
70+
71+
// BackupIndexFilenameTimestampFormat is the format used for the human
72+
// readable start and end times in the index file names.
73+
BackupIndexFilenameTimestampFormat = "20060102-150405.00"
6674
)

pkg/backup/backupdest/BUILD.bazel

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go_library(
44
name = "backupdest",
55
srcs = [
66
"backup_destination.go",
7+
"backup_index.go",
78
"incrementals.go",
89
],
910
importpath = "github.com/cockroachdb/cockroach/pkg/backup/backupdest",
@@ -29,6 +30,7 @@ go_library(
2930
"//pkg/util/hlc",
3031
"//pkg/util/ioctx",
3132
"//pkg/util/mon",
33+
"//pkg/util/protoutil",
3234
"//pkg/util/timeutil",
3335
"//pkg/util/tracing",
3436
"@com_github_cockroachdb_errors//:errors",
@@ -39,36 +41,42 @@ go_test(
3941
name = "backupdest_test",
4042
srcs = [
4143
"backup_destination_test.go",
44+
"backup_index_test.go",
4245
"incrementals_test.go",
4346
"main_test.go",
4447
],
48+
embed = [":backupdest"],
4549
exec_properties = select({
4650
"//build/toolchains:is_heavy": {"test.Pool": "large"},
4751
"//conditions:default": {"test.Pool": "default"},
4852
}),
4953
deps = [
50-
":backupdest",
5154
"//pkg/backup/backupbase",
5255
"//pkg/backup/backuppb",
5356
"//pkg/backup/backuptestutils",
5457
"//pkg/backup/backuputils",
5558
"//pkg/base",
5659
"//pkg/ccl",
5760
"//pkg/cloud",
61+
"//pkg/cloud/cloudpb",
5862
"//pkg/cloud/impl:cloudimpl",
5963
"//pkg/jobs/jobspb",
6064
"//pkg/security/securityassets",
6165
"//pkg/security/securitytest",
6266
"//pkg/security/username",
6367
"//pkg/server",
6468
"//pkg/sql",
69+
"//pkg/testutils",
6570
"//pkg/testutils/serverutils",
6671
"//pkg/testutils/testcluster",
72+
"//pkg/util",
6773
"//pkg/util/hlc",
74+
"//pkg/util/ioctx",
6875
"//pkg/util/leaktest",
6976
"//pkg/util/log",
7077
"//pkg/util/protoutil",
7178
"//pkg/util/randutil",
79+
"@com_github_cockroachdb_errors//:errors",
7280
"@com_github_stretchr_testify//require",
7381
],
7482
)
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package backupdest
7+
8+
import (
9+
"bytes"
10+
"context"
11+
"fmt"
12+
"strings"
13+
14+
"github.com/cockroachdb/cockroach/pkg/backup/backupbase"
15+
"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
16+
"github.com/cockroachdb/cockroach/pkg/backup/backuputils"
17+
"github.com/cockroachdb/cockroach/pkg/cloud"
18+
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
19+
"github.com/cockroachdb/cockroach/pkg/security/username"
20+
"github.com/cockroachdb/cockroach/pkg/util/hlc"
21+
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
22+
"github.com/cockroachdb/cockroach/pkg/util/tracing"
23+
"github.com/cockroachdb/errors"
24+
)
25+
26+
// WriteBackupIndexMetadata writes an index file for the backup described by the
27+
// job details. The provided ExternalStorage needs to be rooted at the specific
28+
// directory that the index file should be written to.
29+
//
30+
// Note: This file is not encrypted, so it should not contain any sensitive
31+
// information.
32+
func WriteBackupIndexMetadata(
33+
ctx context.Context,
34+
user username.SQLUsername,
35+
makeExternalStorageFromURI cloud.ExternalStorageFromURIFactory,
36+
details jobspb.BackupDetails,
37+
) error {
38+
ctx, sp := tracing.ChildSpan(ctx, "backupdest.WriteBackupIndexMetadata")
39+
defer sp.Finish()
40+
41+
if details.EndTime.IsEmpty() {
42+
return errors.AssertionFailedf("end time must be set in backup details")
43+
}
44+
if details.Destination.Exists && details.StartTime.IsEmpty() {
45+
return errors.AssertionFailedf("incremental backup details missing a start time")
46+
}
47+
48+
var backupCollectionURI string
49+
// Find the root of the collection URI that the backup is being written to so
50+
// that we can determine the relative path of the backup.
51+
if details.StartTime.IsEmpty() {
52+
backupCollectionURI = details.CollectionURI
53+
} else {
54+
var err error
55+
backupCollectionURI, err = ResolveDefaultBaseIncrementalStorageLocation(
56+
details.Destination.To, details.Destination.IncrementalStorage,
57+
)
58+
if err != nil {
59+
return errors.Wrapf(err, "get incremental backup collection URI")
60+
}
61+
}
62+
63+
path, err := backuputils.RelativeBackupPathInCollectionURI(backupCollectionURI, details.URI)
64+
if err != nil {
65+
return errors.Wrapf(err, "get relative backup path")
66+
}
67+
metadata := &backuppb.BackupIndexMetadata{
68+
StartTime: details.StartTime,
69+
EndTime: details.EndTime,
70+
Path: path,
71+
}
72+
metadataBytes, err := protoutil.Marshal(metadata)
73+
if err != nil {
74+
return errors.Wrapf(err, "marshal backup index metadata")
75+
}
76+
77+
indexStore, err := makeExternalStorageFromURI(
78+
ctx, details.CollectionURI, user,
79+
)
80+
if err != nil {
81+
return errors.Wrapf(err, "creating external storage")
82+
}
83+
84+
indexFilePath, err := getBackupIndexFilePath(
85+
details.Destination.Subdir,
86+
details.StartTime,
87+
details.EndTime,
88+
)
89+
if err != nil {
90+
return errors.Wrapf(err, "getting index file path")
91+
}
92+
93+
return cloud.WriteFile(
94+
ctx, indexStore, indexFilePath, bytes.NewReader(metadataBytes),
95+
)
96+
}
97+
98+
// getBackupIndexFilePath returns the path to the backup index file representing
99+
// a backup that starts and ends at the given timestamps, including
100+
// the filename and extension. The path is relative to the collection URI.
101+
func getBackupIndexFilePath(subdir string, startTime, endTime hlc.Timestamp) (string, error) {
102+
if strings.EqualFold(subdir, backupbase.LatestFileName) {
103+
return "", errors.AssertionFailedf("expected subdir to be resolved and not be 'LATEST'")
104+
}
105+
// We flatten the subdir so that when listing from the index, we can list with
106+
// the `index/` prefix and delimit on `/`.
107+
flattenedSubdir := strings.ReplaceAll(
108+
strings.TrimPrefix(subdir, "/"),
109+
"/", "-",
110+
)
111+
return backuputils.JoinURLPath(
112+
backupbase.BackupIndexDirectoryPath,
113+
flattenedSubdir,
114+
getBackupIndexFileName(startTime, endTime),
115+
), nil
116+
}
117+
118+
// getBackupIndexFilename generates the filename (including the extension) for a
119+
// backup index file that represents a backup that starts ad ends at the given
120+
// timestamps.
121+
func getBackupIndexFileName(startTime, endTime hlc.Timestamp) string {
122+
descEndTs := backuputils.EncodeDescendingTS(endTime.GoTime())
123+
formattedStartTime := startTime.GoTime().Format(backupbase.BackupIndexFilenameTimestampFormat)
124+
if startTime.IsEmpty() {
125+
formattedStartTime = "0" // Use a placeholder for empty start time.
126+
}
127+
formattedEndTime := endTime.GoTime().Format(backupbase.BackupIndexFilenameTimestampFormat)
128+
return fmt.Sprintf(
129+
"%s_%s_%s_metadata.pb",
130+
descEndTs, formattedStartTime, formattedEndTime,
131+
)
132+
}

0 commit comments

Comments
 (0)