Skip to content

Commit cb138ca

Browse files
authored
br: add keyspace-aware GC safepoint support for backup and restore (#65483)
close #65482
1 parent 2c0a134 commit cb138ca

31 files changed

+1449
-294
lines changed

br/pkg/backup/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ go_library(
1616
"//br/pkg/conn",
1717
"//br/pkg/conn/util",
1818
"//br/pkg/errors",
19+
"//br/pkg/gc",
1920
"//br/pkg/glue",
2021
"//br/pkg/logutil",
2122
"//br/pkg/metautil",
@@ -73,6 +74,7 @@ go_test(
7374
shard_count = 18,
7475
deps = [
7576
"//br/pkg/conn",
77+
"//br/pkg/gc",
7678
"//br/pkg/gluetidb/mock",
7779
"//br/pkg/metautil",
7880
"//br/pkg/mock",
@@ -94,6 +96,7 @@ go_test(
9496
"@com_github_stretchr_testify//require",
9597
"@com_github_tikv_client_go_v2//oracle",
9698
"@com_github_tikv_client_go_v2//testutils",
99+
"@com_github_tikv_client_go_v2//tikv",
97100
"@com_github_tikv_pd_client//:client",
98101
"@io_opencensus_go//stats/view",
99102
"@org_golang_google_grpc//:grpc",

br/pkg/backup/client.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/pingcap/tidb/br/pkg/conn"
2323
connutil "github.com/pingcap/tidb/br/pkg/conn/util"
2424
berrors "github.com/pingcap/tidb/br/pkg/errors"
25+
"github.com/pingcap/tidb/br/pkg/gc"
2526
"github.com/pingcap/tidb/br/pkg/glue"
2627
"github.com/pingcap/tidb/br/pkg/logutil"
2728
"github.com/pingcap/tidb/br/pkg/metautil"
@@ -60,6 +61,8 @@ type ClientMgr interface {
6061
GetBackupClient(ctx context.Context, storeID uint64) (backuppb.BackupClient, error)
6162
ResetBackupClient(ctx context.Context, storeID uint64) (backuppb.BackupClient, error)
6263
GetPDClient() pd.Client
64+
GetStorage() kv.Storage
65+
GetGCManager() gc.Manager
6366
GetLockResolver() *txnlock.LockResolver
6467
Close()
6568
}
@@ -487,7 +490,7 @@ func (bc *Client) GetTS(ctx context.Context, duration time.Duration, ts uint64)
487490
}
488491

489492
// check that the backup time does not precede the GC safe point
490-
err = utils.CheckGCSafePoint(ctx, bc.mgr.GetPDClient(), backupTS)
493+
err = gc.CheckGCSafePoint(ctx, bc.mgr.GetGCManager(), backupTS)
491494
if err != nil {
492495
return 0, errors.Trace(err)
493496
}
@@ -508,13 +511,13 @@ func (bc *Client) GetSafePointID() string {
508511
log.Info("reuse the checkpoint gc-safepoint service id", zap.String("service-id", bc.checkpointMeta.GCServiceId))
509512
return bc.checkpointMeta.GCServiceId
510513
}
511-
return utils.MakeSafePointID()
514+
return gc.MakeSafePointID()
512515
}
513516

514517
// SetGCTTL sets the GC TTL for the client.
515518
func (bc *Client) SetGCTTL(ttl int64) {
516519
if ttl <= 0 {
517-
ttl = utils.DefaultBRGCSafePointTTL
520+
ttl = gc.DefaultBRGCSafePointTTL
518521
}
519522
bc.gcTTL = ttl
520523
}

br/pkg/backup/client_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/pingcap/kvproto/pkg/kvrpcpb"
2020
"github.com/pingcap/tidb/br/pkg/backup"
2121
"github.com/pingcap/tidb/br/pkg/conn"
22+
"github.com/pingcap/tidb/br/pkg/gc"
2223
gluemock "github.com/pingcap/tidb/br/pkg/gluetidb/mock"
2324
"github.com/pingcap/tidb/br/pkg/metautil"
2425
"github.com/pingcap/tidb/br/pkg/mock"
@@ -32,6 +33,7 @@ import (
3233
"github.com/stretchr/testify/require"
3334
"github.com/tikv/client-go/v2/oracle"
3435
"github.com/tikv/client-go/v2/testutils"
36+
"github.com/tikv/client-go/v2/tikv"
3537
pd "github.com/tikv/pd/client"
3638
"go.opencensus.io/stats/view"
3739
)
@@ -117,6 +119,8 @@ func createBackupSuite(t *testing.T) *testBackup {
117119
s.mockCluster = mockCluster
118120
s.ctx, s.cancel = context.WithCancel(context.Background())
119121
mockMgr := &conn.Mgr{PdController: &pdutil.PdController{}}
122+
gcMgr := gc.NewManager(s.mockPDClient, tikv.NullspaceID)
123+
mockMgr.SetGcManager(gcMgr)
120124
mockMgr.SetPDClient(s.mockPDClient)
121125
s.backupClient = backup.NewBackupClient(s.ctx, mockMgr)
122126

br/pkg/conn/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ go_library(
99
"//br/pkg/config",
1010
"//br/pkg/conn/util",
1111
"//br/pkg/errors",
12+
"//br/pkg/gc",
1213
"//br/pkg/glue",
1314
"//br/pkg/logutil",
1415
"//br/pkg/pdutil",

br/pkg/conn/conn.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
kvconfig "github.com/pingcap/tidb/br/pkg/config"
2424
"github.com/pingcap/tidb/br/pkg/conn/util"
2525
berrors "github.com/pingcap/tidb/br/pkg/errors"
26+
"github.com/pingcap/tidb/br/pkg/gc"
2627
"github.com/pingcap/tidb/br/pkg/glue"
2728
"github.com/pingcap/tidb/br/pkg/logutil"
2829
"github.com/pingcap/tidb/br/pkg/pdutil"
@@ -75,6 +76,7 @@ type Mgr struct {
7576
storage kv.Storage // Used to access SQL related interfaces.
7677
tikvStore tikv.Storage // Used to access TiKV specific interfaces.
7778
ownsStorage bool
79+
gcManager gc.Manager
7880

7981
*utils.StoreManager
8082
}
@@ -229,12 +231,20 @@ func NewMgr(
229231
}
230232
}
231233

234+
// Extract keyspaceID from storage
235+
keyspaceID := tikv.NullspaceID
236+
if storage != nil {
237+
keyspaceID = storage.GetCodec().GetKeyspaceID()
238+
}
239+
gcManager := gc.NewManager(controller.GetPDClient(), keyspaceID)
240+
232241
mgr := &Mgr{
233242
PdController: controller,
234243
storage: storage,
235244
tikvStore: tikvStorage,
236245
dom: dom,
237246
ownsStorage: g.OwnsStorage(),
247+
gcManager: gcManager,
238248
StoreManager: utils.NewStoreManager(controller.GetPDClient(), keepalive, tlsConf),
239249
}
240250
return mgr, nil
@@ -266,6 +276,15 @@ func (mgr *Mgr) GetStorage() kv.Storage {
266276
return mgr.storage
267277
}
268278

279+
// GetGCManager returns the GC manager held by this Mgr.
// NewMgr builds it from the PD client and the keyspace ID of the storage.
func (mgr *Mgr) GetGCManager() gc.Manager {
280+
return mgr.gcManager
281+
}
282+
283+
// SetGcManager replaces the GC manager held by this Mgr (for testing purposes).
// NOTE(review): the setter spells "Gc" while the getter is GetGCManager; Go
// initialism style would prefer SetGCManager, but renaming requires updating
// every caller.
284+
func (mgr *Mgr) SetGcManager(gcMgr gc.Manager) {
285+
mgr.gcManager = gcMgr
286+
}
287+
269288
// GetTLSConfig returns the tls config.
270289
func (mgr *Mgr) GetTLSConfig() *tls.Config {
271290
return mgr.StoreManager.TLSConfig()

br/pkg/gc/BUILD.bazel

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
2+
3+
go_library(
4+
name = "gc",
5+
srcs = [
6+
"manager.go",
7+
"manager_global.go",
8+
"manager_keyspace.go",
9+
"safepoint.go",
10+
],
11+
importpath = "github.com/pingcap/tidb/br/pkg/gc",
12+
visibility = ["//visibility:public"],
13+
deps = [
14+
"//br/pkg/errors",
15+
"@com_github_google_uuid//:uuid",
16+
"@com_github_pingcap_errors//:errors",
17+
"@com_github_pingcap_failpoint//:failpoint",
18+
"@com_github_pingcap_log//:log",
19+
"@com_github_tikv_client_go_v2//oracle",
20+
"@com_github_tikv_client_go_v2//tikv",
21+
"@com_github_tikv_pd_client//:client",
22+
"@com_github_tikv_pd_client//clients/gc",
23+
"@org_uber_go_zap//:zap",
24+
"@org_uber_go_zap//zapcore",
25+
],
26+
)
27+
28+
go_test(
29+
name = "gc_test",
30+
timeout = "short",
31+
srcs = [
32+
"manager_test.go",
33+
"mock_test.go",
34+
"safepoint_test.go",
35+
],
36+
flaky = True,
37+
shard_count = 7,
38+
deps = [
39+
":gc",
40+
"//pkg/config",
41+
"//pkg/store/mockstore/unistore/lockstore",
42+
"//pkg/store/mockstore/unistore/tikv",
43+
"//pkg/store/mockstore/unistore/tikv/mvcc",
44+
"@com_github_pingcap_badger//:badger",
45+
"@com_github_stretchr_testify//require",
46+
"@com_github_stretchr_testify//suite",
47+
"@com_github_tikv_client_go_v2//tikv",
48+
"@com_github_tikv_pd_client//:client",
49+
"@com_github_tikv_pd_client//clients/gc",
50+
"@org_uber_go_zap//:zap",
51+
"@org_uber_go_zap//zapcore",
52+
"@org_uber_go_zap//zaptest/observer",
53+
],
54+
)

br/pkg/gc/manager.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright 2025 PingCAP, Inc. Licensed under Apache-2.0.
2+
3+
package gc
4+
5+
import (
6+
"context"
7+
8+
"github.com/tikv/client-go/v2/tikv"
9+
pd "github.com/tikv/pd/client"
10+
)
11+
12+
// Manager abstracts GC operations, supporting both global and keyspace-level GC.
// NewManager picks the implementation: the global one when the keyspace ID is
// tikv.NullspaceID, the keyspace-level one otherwise.
13+
type Manager interface {
14+
// GetGCSafePoint returns the current GC safe point.
// A non-nil error means the returned value must not be used.
15+
GetGCSafePoint(ctx context.Context) (uint64, error)
16+
17+
// SetServiceSafePoint sets the service safe point with TTL.
// sp supplies the service ID, the TTL and the backup timestamp to protect.
18+
// If TTL <= 0, it removes the service safe point.
19+
SetServiceSafePoint(ctx context.Context, sp BRServiceSafePoint) error
20+
21+
// DeleteServiceSafePoint removes the service safe point.
// The global implementation identifies the entry by sp.ID only.
// NOTE(review): presumably the keyspace implementation does the same — confirm.
22+
DeleteServiceSafePoint(ctx context.Context, sp BRServiceSafePoint) error
23+
}
24+
25+
// NewManager creates a GC Manager.
26+
// Pass keyspaceID = tikv.NullspaceID for global mode, or actual keyspaceID for keyspace mode.
27+
func NewManager(pdClient pd.Client, keyspaceID tikv.KeyspaceID) Manager {
28+
if keyspaceID == tikv.NullspaceID {
29+
return newGlobalManager(pdClient)
30+
}
31+
return newKeyspaceManager(pdClient, keyspaceID)
32+
}

br/pkg/gc/manager_global.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// Copyright 2025 PingCAP, Inc. Licensed under Apache-2.0.
2+
3+
package gc
4+
5+
import (
6+
"context"
7+
"os"
8+
"time"
9+
10+
"github.com/pingcap/errors"
11+
"github.com/pingcap/failpoint"
12+
"github.com/pingcap/log"
13+
pd "github.com/tikv/pd/client"
14+
"go.uber.org/zap"
15+
)
16+
17+
// globalManager implements Manager using the global GC safepoint mechanism.
18+
// It uses the deprecated pd.Client.UpdateServiceGCSafePoint API for backward compatibility.
19+
type globalManager struct {
20+
pdClient pd.Client
21+
}
22+
23+
// Ensure globalManager implements Manager interface.
24+
var _ Manager = (*globalManager)(nil)
25+
26+
// newGlobalManager creates a new globalManager instance.
27+
func newGlobalManager(pdClient pd.Client) *globalManager {
28+
return &globalManager{
29+
pdClient: pdClient,
30+
}
31+
}
32+
33+
// GetGCSafePoint returns the current GC safe point.
34+
func (m *globalManager) GetGCSafePoint(ctx context.Context) (uint64, error) {
35+
safePoint, err := m.pdClient.UpdateGCSafePoint(ctx, 0)
36+
if err != nil {
37+
return 0, errors.Trace(err)
38+
}
39+
return safePoint, nil
40+
}
41+
42+
// SetServiceSafePoint sets the global service safe point using the deprecated API.
43+
// This maintains backward compatibility with existing BR behavior.
44+
func (m *globalManager) SetServiceSafePoint(ctx context.Context, sp BRServiceSafePoint) error {
45+
log.Debug("update PD safePoint limit with TTL", zap.Object("safePoint", sp))
46+
47+
lastSafePoint, err := m.pdClient.UpdateServiceGCSafePoint(ctx, sp.ID, sp.TTL, sp.BackupTS-1)
48+
if err == nil {
49+
// Integration tests use this to distinguish global vs keyspace GC protection.
50+
failpoint.Inject("hint-gc-global-set-safepoint", func(v failpoint.Value) {
51+
if sigFile, ok := v.(string); ok {
52+
// Write the service ID so the test can match PD output precisely.
53+
if writeErr := os.WriteFile(sigFile, []byte(sp.ID), 0o644); writeErr != nil {
54+
log.Warn("failed to write failpoint signal file", zap.Error(writeErr), zap.String("file", sigFile))
55+
}
56+
}
57+
// Provide a small observation window for test scripts.
58+
time.Sleep(3 * time.Second)
59+
})
60+
}
61+
if lastSafePoint > sp.BackupTS-1 && sp.TTL > 0 {
62+
log.Warn("service GC safe point lost, we may fail to back up if GC lifetime isn't long enough",
63+
zap.Uint64("lastSafePoint", lastSafePoint),
64+
zap.Object("safePoint", sp),
65+
)
66+
}
67+
return errors.Trace(err)
68+
}
69+
70+
// DeleteServiceSafePoint removes the service safe point by setting TTL to 0.
71+
func (m *globalManager) DeleteServiceSafePoint(ctx context.Context, sp BRServiceSafePoint) error {
72+
// Setting TTL to 0 effectively removes the service safe point
73+
_, err := m.pdClient.UpdateServiceGCSafePoint(ctx, sp.ID, 0, 0)
74+
if err == nil {
75+
failpoint.Inject("hint-gc-global-delete-safepoint", func(v failpoint.Value) {
76+
if sigFile, ok := v.(string); ok {
77+
if writeErr := os.WriteFile(sigFile, []byte(sp.ID), 0o644); writeErr != nil {
78+
log.Warn("failed to write failpoint signal file", zap.Error(writeErr), zap.String("file", sigFile))
79+
}
80+
}
81+
})
82+
}
83+
return errors.Trace(err)
84+
}

0 commit comments

Comments
 (0)