Skip to content

Commit d93799f

Browse files
craig[bot]AlexTalks
andcommitted
107596: kvcoord: disable fatal assertion on transaction commit sanity check r=AlexTalks a=AlexTalks As cockroachdb#103817 is now a known issue for which a root cause has been identified, the sanity check assertion in place here is disabled, with errors occuring due to the known bug properly linking to the issue ticket. While the transaction coordinator still performs sanity checks to ensure that operations cannot continue on an already-committed transaction, it will no longer log the error with severity `FATAL`, resulting in node crash, and will instead simply return the error. Part of: cockroachdb#103817 Release note: None Co-authored-by: Alex Sarkesian <[email protected]>
2 parents 389d3f0 + 1c906bb commit d93799f

File tree

3 files changed

+13
-29
lines changed

3 files changed

+13
-29
lines changed

pkg/kv/kvclient/kvcoord/replayed_commit_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func (f *interceptingTransport) SendNext(
4949
// in which DistSender retries an (unbeknownst to it) successful EndTxn(commit=true)
5050
// RPC. It documents that this triggers an assertion failure in TxnCoordSender.
5151
//
52-
// See: https://github.com/cockroachdb/cockroach/issues/67765
52+
// See: https://github.com/cockroachdb/cockroach/issues/103817
5353
func TestCommitSanityCheckAssertionFiresOnUndetectedAmbiguousCommit(t *testing.T) {
5454
defer leaktest.AfterTest(t)()
5555
defer log.Scope(t).Close(t)
@@ -89,8 +89,6 @@ func TestCommitSanityCheckAssertionFiresOnUndetectedAmbiguousCommit(t *testing.T
8989
},
9090
}, nil
9191
},
92-
// Turn the assertion into an error returned via the txn.
93-
DisableCommitSanityCheck: true,
9492
}
9593

9694
tc := testcluster.StartTestCluster(t, 1, args)

pkg/kv/kvclient/kvcoord/testing_knobs.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,6 @@ type ClientTestingKnobs struct {
4848
// only applies to requests sent with the LEASEHOLDER routing policy.
4949
DontReorderReplicas bool
5050

51-
// DisableCommitSanityCheck allows "setting" the DisableCommitSanityCheck to
52-
// true without actually overriding the variable.
53-
DisableCommitSanityCheck bool
54-
5551
// CommitWaitFilter allows tests to instrument the beginning of a transaction
5652
// commit wait sleep.
5753
CommitWaitFilter func()

pkg/kv/kvclient/kvcoord/txn_coord_sender.go

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation"
2020
"github.com/cockroachdb/cockroach/pkg/roachpb"
2121
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
22-
"github.com/cockroachdb/cockroach/pkg/util/envutil"
2322
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2423
"github.com/cockroachdb/cockroach/pkg/util/log"
2524
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -35,12 +34,6 @@ const (
3534
OpTxnCoordSender = "txn coordinator send"
3635
)
3736

38-
// DisableCommitSanityCheck allows opting out of a fatal assertion error that was observed in the wild
39-
// and for which a root cause is not yet available.
40-
//
41-
// See: https://github.com/cockroachdb/cockroach/pull/73512.
42-
var DisableCommitSanityCheck = envutil.EnvOrDefaultBool("COCKROACH_DISABLE_COMMIT_SANITY_CHECK", false)
43-
4437
// txnState represents states relating to whether an EndTxn request needs
4538
// to be sent.
4639
//
@@ -969,13 +962,14 @@ func (tc *TxnCoordSender) updateStateLocked(
969962
// encounter such errors. Marking a transaction as explicitly-committed can also
970963
// encounter these errors, but those errors don't make it to the TxnCoordSender.
971964
//
972-
// Returns the passed-in error or fatals (depending on DisableCommitSanityCheck
973-
// env var), wrapping the input error in case of an assertion violation.
965+
// Wraps the error in case of an assertion violation, otherwise returns as-is.
974966
//
975-
// The assertion is known to have failed in the wild, see:
976-
// https://github.com/cockroachdb/cockroach/issues/67765
967+
// This checks for the occurence of a known issue involving ambiguous write
968+
// errors that occur alongside commits, which may race with transaction
969+
// recovery requests started by contending operations.
970+
// https://github.com/cockroachdb/cockroach/issues/103817
977971
func sanityCheckErrWithTxn(
978-
ctx context.Context, pErrWithTxn *kvpb.Error, ba *kvpb.BatchRequest, knobs *ClientTestingKnobs,
972+
ctx context.Context, pErrWithTxn *kvpb.Error, ba *kvpb.BatchRequest, _ *ClientTestingKnobs,
979973
) error {
980974
txn := pErrWithTxn.GetTxn()
981975
if txn.Status != roachpb.COMMITTED {
@@ -988,24 +982,20 @@ func sanityCheckErrWithTxn(
988982
return nil
989983
}
990984

991-
// Finding out about our transaction being committed indicates a serious bug.
992-
// Requests are not supposed to be sent on transactions after they are
993-
// committed.
985+
// Finding out about our transaction being committed indicates a serious but
986+
// known bug. Requests are not supposed to be sent on transactions after they
987+
// are committed.
994988
err := errors.Wrapf(pErrWithTxn.GoError(),
995989
"transaction unexpectedly committed, ba: %s. txn: %s",
996990
ba, pErrWithTxn.GetTxn(),
997991
)
998992
err = errors.WithAssertionFailure(
999993
errors.WithIssueLink(err, errors.IssueLink{
1000-
IssueURL: "https://github.com/cockroachdb/cockroach/issues/67765",
994+
IssueURL: "https://github.com/cockroachdb/cockroach/issues/103817",
1001995
Detail: "you have encountered a known bug in CockroachDB, please consider " +
1002-
"reporting on the Github issue or reach out via Support. " +
1003-
"This assertion can be disabled by setting the environment variable " +
1004-
"COCKROACH_DISABLE_COMMIT_SANITY_CHECK=true",
996+
"reporting on the Github issue or reach out via Support.",
1005997
}))
1006-
if !DisableCommitSanityCheck && !knobs.DisableCommitSanityCheck {
1007-
log.Fatalf(ctx, "%s", err)
1008-
}
998+
log.Warningf(ctx, "%v", err)
1009999
return err
10101000
}
10111001

0 commit comments

Comments
 (0)