@@ -17,6 +17,7 @@ import (
1717 "math"
1818 "math/rand"
1919 "reflect"
20+ "regexp"
2021 "strconv"
2122 "strings"
2223 "sync"
@@ -58,10 +59,13 @@ import (
5859 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
5960 "github.com/cockroachdb/cockroach/pkg/util/leaktest"
6061 "github.com/cockroachdb/cockroach/pkg/util/log"
62+ "github.com/cockroachdb/cockroach/pkg/util/log/logpb"
6163 "github.com/cockroachdb/cockroach/pkg/util/protoutil"
6264 "github.com/cockroachdb/cockroach/pkg/util/randutil"
6365 "github.com/cockroachdb/cockroach/pkg/util/stop"
6466 "github.com/cockroachdb/cockroach/pkg/util/timeutil"
67+ "github.com/cockroachdb/cockroach/pkg/util/tracing"
68+ "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
6569 "github.com/cockroachdb/cockroach/pkg/util/uuid"
6670 "github.com/cockroachdb/errors"
6771 "github.com/stretchr/testify/assert"
@@ -1460,6 +1464,187 @@ func (c fakeSnapshotStream) Send(request *kvserverpb.SnapshotResponse) error {
14601464 return nil
14611465}
14621466
// snapshotTestSignals bundles the channels that TestReceiveSnapshotLogging
// uses to step the receiving store's snapshot testing knobs and the test's
// own driver goroutine through a snapshot in lockstep.
type snapshotTestSignals struct {
	// Channels the receiver-side knobs block on.

	// receiveErrCh supplies the error returned by the ReceiveSnapshot knob;
	// closing it makes the knob return nil.
	receiveErrCh chan error
	// batchReceiveReadyCh releases the BeforeRecvAcceptedSnapshot knob.
	batchReceiveReadyCh chan struct{}

	// Channels the sender side (the test) blocks on.

	// svrContextDone is the Done channel of the context handed to the
	// ReceiveSnapshot knob. It is assigned by the knob before
	// receiveStartedCh is closed, so read it only after that close is seen.
	svrContextDone <-chan struct{}
	// receiveStartedCh is closed once the ReceiveSnapshot knob has run.
	receiveStartedCh chan struct{}
	// batchReceiveStartedCh receives a value when the
	// BeforeRecvAcceptedSnapshot knob is entered.
	batchReceiveStartedCh chan struct{}
	// receiverDoneCh receives a value from the HandleSnapshotDone knob.
	receiverDoneCh chan struct{}
}
1478+
1479+ // TestReceiveSnapshotLogging tests that a snapshot receiver properly captures
1480+ // the collected tracing spans in the last response, or logs the span if the
1481+ // context is cancelled from the client side.
1482+ func TestReceiveSnapshotLogging (t * testing.T ) {
1483+ defer leaktest .AfterTest (t )()
1484+ defer log .Scope (t ).Close (t )
1485+
1486+ const senderNodeIdx = 0
1487+ const receiverNodeIdx = 1
1488+ const dummyEventMsg = "test receive snapshot logging - dummy event"
1489+
1490+ setupTest := func (t * testing.T ) (context.Context , * testcluster.TestCluster , * roachpb.RangeDescriptor , * snapshotTestSignals ) {
1491+ ctx := context .Background ()
1492+
1493+ signals := & snapshotTestSignals {
1494+ receiveErrCh : make (chan error ),
1495+ batchReceiveReadyCh : make (chan struct {}),
1496+
1497+ svrContextDone : nil ,
1498+ receiveStartedCh : make (chan struct {}),
1499+ batchReceiveStartedCh : make (chan struct {}),
1500+ receiverDoneCh : make (chan struct {}, 1 ),
1501+ }
1502+
1503+ tc := testcluster .StartTestCluster (t , 3 , base.TestClusterArgs {
1504+ ServerArgs : base.TestServerArgs {
1505+ Knobs : base.TestingKnobs {
1506+ Store : & kvserver.StoreTestingKnobs {
1507+ DisableRaftSnapshotQueue : true ,
1508+ },
1509+ },
1510+ },
1511+ ReplicationMode : base .ReplicationManual ,
1512+ ServerArgsPerNode : map [int ]base.TestServerArgs {
1513+ receiverNodeIdx : {
1514+ Knobs : base.TestingKnobs {
1515+ Store : & kvserver.StoreTestingKnobs {
1516+ DisableRaftSnapshotQueue : true ,
1517+ ThrottleEmptySnapshots : true ,
1518+ ReceiveSnapshot : func (ctx context.Context , _ * kvserverpb.SnapshotRequest_Header ) error {
1519+ t .Logf ("incoming snapshot on n2" )
1520+ log .Event (ctx , dummyEventMsg )
1521+ signals .svrContextDone = ctx .Done ()
1522+ close (signals .receiveStartedCh )
1523+ return <- signals .receiveErrCh
1524+ },
1525+ BeforeRecvAcceptedSnapshot : func () {
1526+ t .Logf ("receiving on n2" )
1527+ signals .batchReceiveStartedCh <- struct {}{}
1528+ <- signals .batchReceiveReadyCh
1529+ },
1530+ HandleSnapshotDone : func () {
1531+ t .Logf ("receiver on n2 completed" )
1532+ signals .receiverDoneCh <- struct {}{}
1533+ },
1534+ },
1535+ },
1536+ },
1537+ },
1538+ })
1539+
1540+ _ , scratchRange , err := tc .Servers [0 ].ScratchRangeEx ()
1541+ require .NoError (t , err )
1542+
1543+ return ctx , tc , & scratchRange , signals
1544+ }
1545+
1546+ snapshotAndValidateLogs := func (t * testing.T , ctx context.Context , tc * testcluster.TestCluster , rngDesc * roachpb.RangeDescriptor , signals * snapshotTestSignals , expectTraceOnSender bool ) error {
1547+ t .Helper ()
1548+
1549+ repl := tc .GetFirstStoreFromServer (t , senderNodeIdx ).LookupReplica (rngDesc .StartKey )
1550+ chgs := kvpb .MakeReplicationChanges (roachpb .ADD_VOTER , tc .Target (receiverNodeIdx ))
1551+
1552+ testStartTs := timeutil .Now ()
1553+ _ , pErr := repl .ChangeReplicas (ctx , rngDesc , kvserverpb .SnapshotRequest_REBALANCE , kvserverpb .ReasonRangeUnderReplicated , "" , chgs )
1554+
1555+ // When ready, flush logs and check messages from store_raft.go since
1556+ // call to repl.ChangeReplicas(..).
1557+ <- signals .receiverDoneCh
1558+ log .Flush ()
1559+ entries , err := log .FetchEntriesFromFiles (testStartTs .UnixNano (),
1560+ math .MaxInt64 , 100 , regexp .MustCompile (`store_raft\.go` ), log .WithMarkedSensitiveData )
1561+ require .NoError (t , err )
1562+
1563+ errRegexp , err := regexp .Compile (`incoming snapshot stream failed with error` )
1564+ require .NoError (t , err )
1565+ foundEntry := false
1566+ var entry logpb.Entry
1567+ for _ , entry = range entries {
1568+ if errRegexp .MatchString (entry .Message ) {
1569+ foundEntry = true
1570+ break
1571+ }
1572+ }
1573+ expectTraceOnReceiver := ! expectTraceOnSender
1574+ require .Equal (t , expectTraceOnReceiver , foundEntry )
1575+ if expectTraceOnReceiver {
1576+ require .Contains (t , entry .Message , dummyEventMsg )
1577+ }
1578+
1579+ // Check that receiver traces were imported in sender's context on success.
1580+ clientTraces := tracing .SpanFromContext (ctx ).GetConfiguredRecording ()
1581+ _ , receiverTraceFound := clientTraces .FindLogMessage (dummyEventMsg )
1582+ require .Equal (t , expectTraceOnSender , receiverTraceFound )
1583+
1584+ return pErr
1585+ }
1586+
1587+ t .Run ("cancel on header" , func (t * testing.T ) {
1588+ ctx , tc , scratchRange , signals := setupTest (t )
1589+ defer tc .Stopper ().Stop (ctx )
1590+
1591+ ctx , sp := tracing .EnsureChildSpan (ctx , tc .GetFirstStoreFromServer (t , senderNodeIdx ).GetStoreConfig ().Tracer (),
1592+ t .Name (), tracing .WithRecording (tracingpb .RecordingVerbose ))
1593+ defer sp .Finish ()
1594+
1595+ ctx , cancel := context .WithCancel (ctx )
1596+ go func () {
1597+ <- signals .receiveStartedCh
1598+ cancel ()
1599+ <- signals .svrContextDone
1600+ time .Sleep (10 * time .Millisecond )
1601+ signals .receiveErrCh <- errors .Errorf ("header is bad" )
1602+ }()
1603+ err := snapshotAndValidateLogs (t , ctx , tc , scratchRange , signals , false /* expectTraceOnSender */ )
1604+ require .Error (t , err )
1605+ })
1606+ t .Run ("cancel during receive" , func (t * testing.T ) {
1607+ ctx , tc , scratchRange , signals := setupTest (t )
1608+ defer tc .Stopper ().Stop (ctx )
1609+
1610+ ctx , sp := tracing .EnsureChildSpan (ctx , tc .GetFirstStoreFromServer (t , senderNodeIdx ).GetStoreConfig ().Tracer (),
1611+ t .Name (), tracing .WithRecording (tracingpb .RecordingVerbose ))
1612+ defer sp .Finish ()
1613+
1614+ ctx , cancel := context .WithCancel (ctx )
1615+ close (signals .receiveErrCh )
1616+ go func () {
1617+ <- signals .receiveStartedCh
1618+ <- signals .batchReceiveStartedCh
1619+ cancel ()
1620+ <- signals .svrContextDone
1621+ time .Sleep (10 * time .Millisecond )
1622+ close (signals .batchReceiveReadyCh )
1623+ }()
1624+ err := snapshotAndValidateLogs (t , ctx , tc , scratchRange , signals , false /* expectTraceOnSender */ )
1625+ require .Error (t , err )
1626+ })
1627+ t .Run ("successful send" , func (t * testing.T ) {
1628+ ctx , tc , scratchRange , signals := setupTest (t )
1629+ defer tc .Stopper ().Stop (ctx )
1630+
1631+ ctx , sp := tracing .EnsureChildSpan (ctx , tc .GetFirstStoreFromServer (t , senderNodeIdx ).GetStoreConfig ().Tracer (),
1632+ t .Name (), tracing .WithRecording (tracingpb .RecordingVerbose ))
1633+ defer sp .Finish ()
1634+
1635+ ctx , cancel := context .WithCancel (ctx )
1636+ defer cancel ()
1637+ close (signals .receiveErrCh )
1638+ close (signals .batchReceiveReadyCh )
1639+ go func () {
1640+ <- signals .receiveStartedCh
1641+ <- signals .batchReceiveStartedCh
1642+ }()
1643+ err := snapshotAndValidateLogs (t , ctx , tc , scratchRange , signals , true /* expectTraceOnSender */ )
1644+ require .NoError (t , err )
1645+ })
1646+ }
1647+
14631648// TestFailedSnapshotFillsReservation tests that failing to finish applying an
14641649// incoming snapshot still cleans up the outstanding reservation that was made.
14651650func TestFailedSnapshotFillsReservation (t * testing.T ) {
0 commit comments