Skip to content

Commit c15e86a

Browse files
craig[bot]knz
andcommitted
106396: server: de-flake a decommission test race condition r=abarganier a=knz Informs cockroachdb#103698 (will fix when backported to 23.1). Epic: CRDB-28893 When a Decommission request is sent that addresses multiple nodes simultaneously, a race condition existed in the code that logs the decommission event to the event log. This is because the `sql.InsertEventRecords` API expects to take ownership over the events. The `Decommission` handler was violating the expectation by passing the same event references to multiple subsequent calls. This was not visible in practice however, because the racy writes were always overwriting the same value to the same field. This patch fixes it by allocating a new event for each subsequent node decommission. Release note: None Co-authored-by: Raphael 'kena' Poss <[email protected]>
2 parents b732d8f + 8ea6f87 commit c15e86a

File tree

2 files changed

+26
-18
lines changed

2 files changed

+26
-18
lines changed

pkg/server/decommission.go

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -347,25 +347,26 @@ func (s *Server) Decommission(
347347
nodeIDs = orderedNodeIDs
348348
}
349349

350-
var event logpb.EventPayload
351-
var nodeDetails *eventpb.CommonNodeDecommissionDetails
352-
if targetStatus.Decommissioning() {
353-
ev := &eventpb.NodeDecommissioning{}
354-
nodeDetails = &ev.CommonNodeDecommissionDetails
355-
event = ev
356-
} else if targetStatus.Decommissioned() {
357-
ev := &eventpb.NodeDecommissioned{}
358-
nodeDetails = &ev.CommonNodeDecommissionDetails
359-
event = ev
360-
} else if targetStatus.Active() {
361-
ev := &eventpb.NodeRecommissioned{}
362-
nodeDetails = &ev.CommonNodeDecommissionDetails
363-
event = ev
364-
} else {
365-
panic("unexpected target membership status")
350+
newEvent := func() (event logpb.EventPayload, nodeDetails *eventpb.CommonNodeDecommissionDetails) {
351+
if targetStatus.Decommissioning() {
352+
ev := &eventpb.NodeDecommissioning{}
353+
nodeDetails = &ev.CommonNodeDecommissionDetails
354+
event = ev
355+
} else if targetStatus.Decommissioned() {
356+
ev := &eventpb.NodeDecommissioned{}
357+
nodeDetails = &ev.CommonNodeDecommissionDetails
358+
event = ev
359+
} else if targetStatus.Active() {
360+
ev := &eventpb.NodeRecommissioned{}
361+
nodeDetails = &ev.CommonNodeDecommissionDetails
362+
event = ev
363+
} else {
364+
panic(errors.AssertionFailedf("unexpected target membership status: %v", targetStatus))
365+
}
366+
event.CommonDetails().Timestamp = timeutil.Now().UnixNano()
367+
nodeDetails.RequestingNodeID = int32(s.NodeID())
368+
return event, nodeDetails
366369
}
367-
event.CommonDetails().Timestamp = timeutil.Now().UnixNano()
368-
nodeDetails.RequestingNodeID = int32(s.NodeID())
369370

370371
for _, nodeID := range nodeIDs {
371372
statusChanged, err := s.nodeLiveness.SetMembershipStatus(ctx, nodeID, targetStatus)
@@ -377,6 +378,7 @@ func (s *Server) Decommission(
377378
return grpcstatus.Errorf(codes.Internal, err.Error())
378379
}
379380
if statusChanged {
381+
event, nodeDetails := newEvent()
380382
nodeDetails.TargetNodeID = int32(nodeID)
381383
// Ensure an entry is produced in the external log in all cases.
382384
log.StructuredEvent(ctx, event)

pkg/sql/event_log.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,9 @@ const (
481481
// of the provided transaction, using the provided internal executor.
482482
//
483483
// This converts to a call to insertEventRecords() with just 1 entry.
484+
//
485+
// Note: it is not safe to pass the same entry references to multiple
486+
// subsequent calls (it causes a race condition).
484487
func InsertEventRecords(
485488
ctx context.Context, execCfg *ExecutorConfig, dst LogEventDestination, info ...logpb.EventPayload,
486489
) {
@@ -516,6 +519,9 @@ func InsertEventRecords(
516519
// - if there's at txn, after the txn commit time (i.e. we don't log
517520
// if the txn ends up aborting), using a txn commit trigger.
518521
// - otherwise (no txn), immediately.
522+
//
523+
// Note: it is not safe to pass the same entry references to multiple
524+
// subsequent calls (it causes a race condition).
519525
func insertEventRecords(
520526
ctx context.Context,
521527
execCfg *ExecutorConfig,

0 commit comments

Comments
 (0)