Skip to content

Commit 5b8f942

Browse files
craig[bot]arulajmanirafiss
committed
149620: kvserver: do not write truncated state during evaluation r=pav-kv a=arulajmani This moves writing the truncated state from evaluation to splitPreApply, which is run locally on each store before the write batch constructed as part of the split is applied. This means we no longer replicate state that nominally belongs to the raft engine. Closes #144540 Release note: None 149695: catalog: make DescriptorType unredactable r=rafiss a=rafiss fixes #149396 Release note: None Co-authored-by: Arul Ajmani <[email protected]> Co-authored-by: Rafi Shamim <[email protected]>
3 parents 70acc70 + 47af090 + 24c5b2e commit 5b8f942

File tree

11 files changed

+125
-14
lines changed

11 files changed

+125
-14
lines changed

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,4 +415,4 @@ trace.zipkin.collector string the address of a Zipkin instance to receive trace
415415
ui.database_locality_metadata.enabled boolean true if enabled shows extended locality data about databases and tables in DB Console which can be expensive to compute application
416416
ui.default_timezone string the default timezone used to format timestamps in the ui application
417417
ui.display_timezone enumeration etc/utc the timezone used to format timestamps in the ui. This setting is deprecatedand will be removed in a future version. Use the 'ui.default_timezone' setting instead. 'ui.default_timezone' takes precedence over this setting. [etc/utc = 0, america/new_york = 1] application
418-
version version 1000025.2-upgrading-to-1000025.3-step-008 set the active cluster version in the format '<major>.<minor>' application
418+
version version 1000025.2-upgrading-to-1000025.3-step-010 set the active cluster version in the format '<major>.<minor>' application

docs/generated/settings/settings.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,6 @@
373373
<tr><td><div id="setting-ui-database-locality-metadata-enabled" class="anchored"><code>ui.database_locality_metadata.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if enabled shows extended locality data about databases and tables in DB Console which can be expensive to compute</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
374374
<tr><td><div id="setting-ui-default-timezone" class="anchored"><code>ui.default_timezone</code></div></td><td>string</td><td><code></code></td><td>the default timezone used to format timestamps in the ui</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
375375
<tr><td><div id="setting-ui-display-timezone" class="anchored"><code>ui.display_timezone</code></div></td><td>enumeration</td><td><code>etc/utc</code></td><td>the timezone used to format timestamps in the ui. This setting is deprecatedand will be removed in a future version. Use the &#39;ui.default_timezone&#39; setting instead. &#39;ui.default_timezone&#39; takes precedence over this setting. [etc/utc = 0, america/new_york = 1]</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
376-
<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000025.2-upgrading-to-1000025.3-step-008</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
376+
<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000025.2-upgrading-to-1000025.3-step-010</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
377377
</tbody>
378378
</table>

pkg/clusterversion/cockroach_versions.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,13 @@ const (
219219
V25_3_AddEstimatedLastLoginTime
220220

221221
V25_3_AddHotRangeLoggerJob
222+
223+
// V25_3_WriteInitialTruncStateBeforeSplitApplication is the version above
224+
// which we write the initial truncated state before applying a split. By
225+
// extension, we no longer need to replicate the truncated state when
226+
// constructing the split write batch.
227+
V25_3_WriteInitialTruncStateBeforeSplitApplication
228+
222229
// *************************************************
223230
// Step (1) Add new versions above this comment.
224231
// Do not add new versions to a patch release.
@@ -279,6 +286,8 @@ var versionTable = [numKeys]roachpb.Version{
279286

280287
V25_3_AddHotRangeLoggerJob: {Major: 25, Minor: 2, Internal: 8},
281288

289+
V25_3_WriteInitialTruncStateBeforeSplitApplication: {Major: 25, Minor: 2, Internal: 10},
290+
282291
// *************************************************
283292
// Step (2): Add new versions above this comment.
284293
// Do not add new versions to a patch release.

pkg/kv/kvserver/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,7 @@ go_test(
579579
"@org_golang_google_grpc//:grpc",
580580
"@org_golang_google_grpc//metadata",
581581
"@org_golang_x_exp//maps",
582+
"@org_golang_x_exp//slices",
582583
"@org_golang_x_sync//errgroup",
583584
"@org_golang_x_sync//syncmap",
584585
],

pkg/kv/kvserver/batcheval/cmd_end_transaction.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1481,9 +1481,15 @@ func splitTriggerHelper(
14811481
); err != nil {
14821482
return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to write initial Replica state")
14831483
}
1484-
// TODO(arulajmani): remove WriteInitialTruncState.
1485-
if err := stateloader.WriteInitialTruncState(ctx, batch, split.RightDesc.RangeID); err != nil {
1486-
return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to write initial Replica state")
1484+
// TODO(arulajmani): This can be removed once all nodes are past the
1485+
// V25_3_WriteInitialTruncStateBeforeSplitApplication cluster version.
1486+
// At that point, we'll no longer need to replicate the truncated state
1487+
// as all replicas will be responsible for writing it locally before
1488+
// applying the split.
1489+
if !rec.ClusterSettings().Version.IsActive(ctx, clusterversion.V25_3_WriteInitialTruncStateBeforeSplitApplication) {
1490+
if err := stateloader.WriteInitialTruncState(ctx, batch, split.RightDesc.RangeID); err != nil {
1491+
return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to write initial Replica state")
1492+
}
14871493
}
14881494
}
14891495

pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2232,13 +2232,14 @@ func TestSplitTriggerWritesInitialReplicaState(t *testing.T) {
22322232
require.NoError(t, err)
22332233
require.NotNil(t, loadedGCHint)
22342234
require.Equal(t, gcHint, *loadedGCHint)
2235-
expTruncState := kvserverpb.RaftTruncatedState{
2236-
Term: stateloader.RaftInitialLogTerm,
2237-
Index: stateloader.RaftInitialLogIndex,
2238-
}
2235+
2236+
// The split trigger doesn't write the initial truncated state for the RHS
2237+
// as it isn't part of the range's applied state.
2238+
expTruncState := kvserverpb.RaftTruncatedState{}
22392239
loadedTruncState, err := slRight.LoadRaftTruncatedState(ctx, batch)
22402240
require.NoError(t, err)
22412241
require.Equal(t, expTruncState, loadedTruncState)
2242+
22422243
loadedVersion, err := slRight.LoadVersion(ctx, batch)
22432244
require.NoError(t, err)
22442245
require.Equal(t, version, loadedVersion)

pkg/kv/kvserver/stateloader/initial.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,10 @@ func WriteInitialReplicaState(
107107
return newMS, nil
108108
}
109109

110-
// WriteInitialTruncState writes the initial RaftTruncatedState.
111-
// TODO(arulajmani): remove this.
110+
// WriteInitialTruncState writes the initial truncated state.
111+
//
112+
// TODO(arul): this can be removed once no longer call this from the split
113+
// evaluation path.
112114
func WriteInitialTruncState(ctx context.Context, w storage.Writer, rangeID roachpb.RangeID) error {
113115
return logstore.NewStateLoader(rangeID).SetRaftTruncatedState(ctx, w,
114116
&kvserverpb.RaftTruncatedState{

pkg/kv/kvserver/store_split.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"bytes"
1010
"context"
1111

12+
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
1213
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
1314
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/load"
1415
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
@@ -131,13 +132,24 @@ func splitPreApply(
131132
}
132133

133134
// The RHS replica exists and is uninitialized. We are initializing it here.
134-
// Update the raft HardState with the new Commit index (taken from the applied
135-
// state in the write batch), and use existing or default Term and Vote. This
136-
// is the common case.
135+
// This is the common case.
136+
//
137+
// Update the raft HardState with the new Commit index (taken from the
138+
// applied state in the write batch), and use existing[*] or default Term
139+
// and Vote. Also write the initial RaftTruncatedState.
140+
//
141+
// [*] Note that uninitialized replicas may cast votes, and if they have, we
142+
// can't load the default Term and Vote values.
137143
rsl := stateloader.Make(split.RightDesc.RangeID)
138144
if err := rsl.SynthesizeRaftState(ctx, readWriter); err != nil {
139145
log.Fatalf(ctx, "%v", err)
140146
}
147+
if err := rsl.SetRaftTruncatedState(ctx, readWriter, &kvserverpb.RaftTruncatedState{
148+
Index: stateloader.RaftInitialLogIndex,
149+
Term: stateloader.RaftInitialLogTerm,
150+
}); err != nil {
151+
log.Fatalf(ctx, "%v", err)
152+
}
141153
// Persist the closed timestamp.
142154
//
143155
// In order to tolerate a nil initClosedTS input, let's forward to

pkg/kv/kvserver/store_test.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ import (
7272
"github.com/cockroachdb/redact"
7373
"github.com/kr/pretty"
7474
"github.com/stretchr/testify/require"
75+
"golang.org/x/exp/slices"
7576
"golang.org/x/sync/errgroup"
7677
)
7778

@@ -4128,6 +4129,77 @@ func TestStoreGetOrCreateReplicaWritesRaftReplicaID(t *testing.T) {
41284129
require.Equal(t, kvserverpb.RaftReplicaID{ReplicaID: 7}, replicaID)
41294130
}
41304131

4132+
// TestSplitPreApplyInitializesTruncatedState ensures that the Raft truncated
4133+
// state for the RHS is correctly initialized when calling splitPreApply.
4134+
func TestSplitPreApplyInitializesTruncatedState(t *testing.T) {
4135+
defer leaktest.AfterTest(t)()
4136+
defer log.Scope(t).Close(t)
4137+
4138+
ctx := context.Background()
4139+
manual := timeutil.NewManualTime(timeutil.Unix(0, 10))
4140+
clock := hlc.NewClockForTesting(manual)
4141+
4142+
db := storage.NewDefaultInMemForTesting()
4143+
defer db.Close()
4144+
batch := db.NewBatch()
4145+
defer batch.Close()
4146+
4147+
startKey := roachpb.Key("0000")
4148+
endKey := roachpb.Key("9999")
4149+
desc := roachpb.RangeDescriptor{
4150+
RangeID: 99,
4151+
StartKey: roachpb.RKey(startKey),
4152+
EndKey: roachpb.RKey(endKey),
4153+
}
4154+
desc.AddReplica(1, 1, roachpb.VOTER_FULL)
4155+
4156+
sl := stateloader.Make(desc.RangeID)
4157+
// Write the range state that will be consulted and copied during the split.
4158+
lease := roachpb.Lease{
4159+
Replica: desc.InternalReplicas[0],
4160+
Term: 10,
4161+
MinExpiration: hlc.Timestamp{WallTime: 100},
4162+
}
4163+
err := sl.SetLease(ctx, batch, nil, lease)
4164+
require.NoError(t, err)
4165+
4166+
// Set up the store and LHS replica.
4167+
cfg := TestStoreConfig(clock)
4168+
stopper := stop.NewStopper()
4169+
defer stopper.Stop(ctx)
4170+
store := createTestStoreWithConfig(ctx, t, stopper, testStoreOpts{}, &cfg)
4171+
lhsRepl := createReplica(store, desc.RangeID, desc.StartKey, desc.EndKey)
4172+
4173+
// Construct the split trigger.
4174+
splitKey := roachpb.RKey("5555")
4175+
leftDesc, rightDesc := desc, desc
4176+
leftDesc.EndKey = splitKey
4177+
rightDesc.RangeID++
4178+
rightDesc.StartKey = splitKey
4179+
rightDesc.InternalReplicas = slices.Clone(leftDesc.InternalReplicas)
4180+
rightDesc.InternalReplicas[0].ReplicaID++
4181+
4182+
// Create an uninitialized replica for the RHS. splitPreApply expects this.
4183+
_, _, err = store.getOrCreateReplica(
4184+
ctx, rightDesc.RangeID, rightDesc.InternalReplicas[0].ReplicaID, &rightDesc.InternalReplicas[0],
4185+
)
4186+
require.NoError(t, err)
4187+
4188+
split := roachpb.SplitTrigger{
4189+
LeftDesc: leftDesc,
4190+
RightDesc: rightDesc,
4191+
}
4192+
4193+
splitPreApply(ctx, lhsRepl, batch, split, nil)
4194+
4195+
// Verify that the RHS truncated state is initialized as expected.
4196+
rsl := stateloader.Make(rightDesc.RangeID)
4197+
truncState, err := rsl.LoadRaftTruncatedState(ctx, batch)
4198+
require.NoError(t, err)
4199+
require.Equal(t, stateloader.RaftInitialLogIndex, int(truncState.Index))
4200+
require.Equal(t, stateloader.RaftInitialLogTerm, int(truncState.Term))
4201+
}
4202+
41314203
func BenchmarkStoreGetReplica(b *testing.B) {
41324204
ctx := context.Background()
41334205
stopper := stop.NewStopper()

pkg/sql/catalog/descriptor.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ import (
3131
// DescriptorType is a symbol representing the (sub)type of a descriptor.
3232
type DescriptorType string
3333

34+
var _ redact.SafeValue = DescriptorType("")
35+
36+
// SafeValue implements redact.SafeValue.
37+
func (DescriptorType) SafeValue() {}
38+
3439
const (
3540
// Any represents any descriptor.
3641
Any DescriptorType = "any"

0 commit comments

Comments
 (0)