@@ -43,13 +43,12 @@ import (
43
43
"github.com/cockroachdb/cockroach/pkg/sql/types"
44
44
"github.com/cockroachdb/cockroach/pkg/testutils"
45
45
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
46
- "github.com/cockroachdb/cockroach/pkg/testutils/skip"
47
46
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
48
- "github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
49
47
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
50
48
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
51
49
"github.com/cockroachdb/cockroach/pkg/util/log"
52
50
"github.com/cockroachdb/cockroach/pkg/util/mon"
51
+ "github.com/cockroachdb/cockroach/pkg/util/protoutil"
53
52
"github.com/cockroachdb/errors"
54
53
"github.com/stretchr/testify/require"
55
54
"google.golang.org/protobuf/proto"
@@ -576,61 +575,55 @@ INSERT INTO foo VALUES (1), (10), (100);
576
575
func TestIndexBackfillerResumePreservesProgress (t * testing.T ) {
577
576
defer leaktest .AfterTest (t )()
578
577
defer log .Scope (t ).Close (t )
579
- skip .UnderStress (t , "timing-sensitive test" )
580
578
581
579
ctx := context .Background ()
582
- backfillProgressUpdateCh := make (chan struct {})
583
580
backfillProgressCompletedCh := make (chan []roachpb.Span )
584
- checkpointedSpansCh := make (chan []roachpb.Span )
585
581
const numSpans = 100
586
582
var isBlockingBackfillProgress atomic.Bool
587
583
isBlockingBackfillProgress .Store (true )
588
- var isBlockingCheckpoint atomic.Bool
589
584
590
- clusterArgs := base.TestClusterArgs {
591
- ServerArgs : base.TestServerArgs {
592
- Knobs : base.TestingKnobs {
593
- DistSQL : & execinfra.TestingKnobs {
594
- // We want to push progress every batch_size rows to control
595
- // the backfill incrementally.
596
- BulkAdderFlushesEveryBatch : true ,
597
- RunBeforeIndexBackfillProgressUpdate : func (completed []roachpb.Span ) {
598
- if isBlockingBackfillProgress .Load () {
599
- <- backfillProgressUpdateCh
600
- backfillProgressCompletedCh <- completed
585
+ // Start the server with testing knob.
586
+ tc , db , _ := serverutils .StartServer (t , base.TestServerArgs {
587
+ Knobs : base.TestingKnobs {
588
+ JobsTestingKnobs : jobs .NewTestingKnobsWithShortIntervals (),
589
+ DistSQL : & execinfra.TestingKnobs {
590
+ // We want to push progress every batch_size rows to control
591
+ // the backfill incrementally.
592
+ BulkAdderFlushesEveryBatch : true ,
593
+ RunBeforeIndexBackfillProgressUpdate : func (ctx context.Context , completed []roachpb.Span ) {
594
+ if isBlockingBackfillProgress .Load () {
595
+ select {
596
+ case <- ctx .Done ():
597
+ case backfillProgressCompletedCh <- completed :
598
+ t .Logf ("before index backfill progress update, completed spans: %v" , completed )
601
599
}
602
- },
600
+ }
603
601
},
604
- SQLDeclarativeSchemaChanger : & scexec. TestingKnobs {
605
- RunBeforeBackfill : func ( progresses [] scexec.BackfillProgress ) error {
606
- if isBlockingCheckpoint . Load () && progresses != nil && progresses [ 0 ]. CompletedSpans != nil {
607
- checkpointedSpansCh <- progresses [ 0 ]. CompletedSpans
608
- }
609
- return nil
610
- },
602
+ },
603
+ SQLDeclarativeSchemaChanger : & scexec.TestingKnobs {
604
+ RunBeforeBackfill : func ( progresses []scexec. BackfillProgress ) error {
605
+ if progresses != nil {
606
+ t . Logf ( "before resuming backfill, checkpointed spans: %v" , progresses [ 0 ]. CompletedSpans )
607
+ }
608
+ return nil
611
609
},
612
610
},
613
611
},
614
- }
615
- // Start the server with a testing knob
616
- tc := testcluster .NewTestCluster (t , 1 , clusterArgs )
617
- tc .Start (t )
612
+ })
618
613
defer tc .Stopper ().Stop (ctx )
619
- db := tc .Conns [0 ]
620
614
621
615
_ , err := db .Exec (`SET CLUSTER SETTING bulkio.index_backfill.batch_size = 10` )
622
616
require .NoError (t , err )
623
617
// Ensure that we checkpoint our progress to the backfill job so that
624
618
// RESUMEs can get an up-to-date backfill progress.
625
- _ , err = db .Exec (`SET CLUSTER SETTING bulkio.index_backfill.checkpoint_interval = '0s '` )
619
+ _ , err = db .Exec (`SET CLUSTER SETTING bulkio.index_backfill.checkpoint_interval = '10ms '` )
626
620
require .NoError (t , err )
627
621
_ , err = db .Exec (`CREATE TABLE t(i INT PRIMARY KEY)` )
628
622
require .NoError (t , err )
629
623
_ , err = db .Exec (`INSERT INTO t SELECT generate_series(1, $1)` , numSpans )
630
624
require .NoError (t , err )
631
625
_ , err = db .Exec (`ALTER TABLE t SPLIT AT TABLE generate_series(1, $1)` , numSpans )
632
626
require .NoError (t , err )
633
- require .NoError (t , tc .WaitForFullReplication ())
634
627
var descID catid.DescID
635
628
descIDRow := db .QueryRow (`SELECT 't'::regclass::oid` )
636
629
err = descIDRow .Scan (& descID )
@@ -646,112 +639,118 @@ func TestIndexBackfillerResumePreservesProgress(t *testing.T) {
646
639
return nil
647
640
})
648
641
649
- g .GoCtx (func (ctx context.Context ) error {
650
- testutils .SucceedsSoon (t , func () error {
651
- jobIDRow := db .QueryRow (`
642
+ testutils .SucceedsWithin (t , func () error {
643
+ jobIDRow := db .QueryRow (`
652
644
SELECT job_id FROM [SHOW JOBS]
653
- WHERE job_type = 'NEW SCHEMA CHANGE' AND description ILIKE '%ADD COLUMN%'` ,
654
- )
655
- if err := jobIDRow .Scan (& jobID ); err != nil {
645
+ WHERE job_type = 'NEW SCHEMA CHANGE' AND description ILIKE '%ADD COLUMN j%'` ,
646
+ )
647
+ if err := jobIDRow .Scan (& jobID ); err != nil {
648
+ return err
649
+ }
650
+ return nil
651
+ }, 5 * time .Second )
652
+
653
+ ensureJobState := func (targetState string ) {
654
+ testutils .SucceedsWithin (t , func () error {
655
+ var jobState string
656
+ statusRow := db .QueryRow (`SELECT status FROM [SHOW JOB $1]` , jobID )
657
+ if err := statusRow .Scan (& jobState ); err != nil {
656
658
return err
657
659
}
660
+ if jobState != targetState {
661
+ return errors .Errorf ("expected job to be %s, but found status: %s" ,
662
+ targetState , jobState )
663
+ }
658
664
return nil
659
- })
665
+ }, 5 * time .Second )
666
+ }
660
667
661
- ensureJobState := func (targetState string ) {
662
- testutils .SucceedsSoon (t , func () error {
663
- var jobState string
664
- statusRow := db .QueryRow (`SELECT status FROM [SHOW JOB $1]` , jobID )
665
- if err := statusRow .Scan (& jobState ); err != nil {
666
- return err
668
+ var completedSpans roachpb.SpanGroup
669
+ receiveProgressUpdate := func () {
670
+ progressUpdate := <- backfillProgressCompletedCh
671
+
672
+ // Make sure the progress update does not contain overlapping spans.
673
+ for i , span1 := range progressUpdate {
674
+ for j , span2 := range progressUpdate {
675
+ if i <= j {
676
+ continue
667
677
}
668
- if jobState != targetState {
669
- return errors .Errorf ("expected job to be %s, but found status: %s" ,
670
- targetState , jobState )
678
+ if span1 .Overlaps (span2 ) {
679
+ t .Fatalf ("progress update contains overlapping spans: %s and %s" , span1 , span2 )
671
680
}
672
- return nil
673
- })
681
+ }
674
682
}
683
+ completedSpans .Add (progressUpdate ... )
684
+ }
675
685
676
- // Let the backfill step forward a bit before we do our PAUSE/RESUME
677
- // dance.
678
- var spansCompletedBeforePause []roachpb. Span
679
- for i := 0 ; i < 2 ; i ++ {
680
- backfillProgressUpdateCh <- struct {}{}
681
- spansCompletedBeforePause = <- backfillProgressCompletedCh
682
- }
686
+ ensureCompletedSpansAreCheckpointed := func () {
687
+ testutils . SucceedsWithin ( t , func () error {
688
+ stmt := `SELECT payload FROM crdb_internal.system_jobs WHERE id = $1`
689
+ var payloadBytes [] byte
690
+ if err := db . QueryRowContext ( ctx , stmt , jobID ). Scan ( & payloadBytes ); err != nil {
691
+ return err
692
+ }
683
693
684
- _ , err := db .Exec (`PAUSE JOB $1` , jobID )
685
- if err != nil {
686
- return err
687
- }
688
- ensureJobState ("paused" )
694
+ payload := & jobspb.Payload {}
695
+ if err := protoutil .Unmarshal (payloadBytes , payload ); err != nil {
696
+ return err
697
+ }
689
698
690
- _ , err = db .Exec (`RESUME JOB $1` , jobID )
691
- if err != nil {
692
- return err
693
- }
694
- ensureJobState ("running" )
699
+ schemaChangeProgress := * (payload .Details .(* jobspb.Payload_NewSchemaChange ).NewSchemaChange )
700
+ var checkpointedSpans []roachpb.Span
701
+ if len (schemaChangeProgress .BackfillProgress ) > 0 {
702
+ checkpointedSpans = schemaChangeProgress .BackfillProgress [0 ].CompletedSpans
703
+ }
704
+ var sg roachpb.SpanGroup
705
+ sg .Add (checkpointedSpans ... )
706
+ // Ensure that the spans we already completed are fully contained in our
707
+ // checkpointed completed spans group.
708
+ if ! sg .Encloses (completedSpans .Slice ()... ) {
709
+ return errors .Errorf ("checkpointed spans %v do not enclose completed spans %v" ,
710
+ checkpointedSpans , completedSpans .Slice ())
711
+ }
695
712
696
- for i := 0 ; i < 2 ; i ++ {
697
- backfillProgressUpdateCh <- struct {}{}
698
- <- backfillProgressCompletedCh
699
- }
700
- isBlockingCheckpoint .Store (true )
713
+ return nil
714
+ }, 5 * time .Second )
715
+ }
701
716
702
- _ , err = db . Exec ( `PAUSE JOB $1` , jobID )
703
- if err != nil {
704
- return err
705
- }
706
- ensureJobState ( "paused" )
717
+ // Let the backfill step forward a bit before we do our PAUSE/RESUME
718
+ // dance.
719
+ for i := 0 ; i < 2 ; i ++ {
720
+ receiveProgressUpdate ()
721
+ }
707
722
708
- _ , err = db .Exec (`RESUME JOB $1` , jobID )
709
- if err != nil {
710
- return err
711
- }
712
- ensureJobState ("running" )
713
-
714
- var wg sync.WaitGroup
715
- wg .Add (1 )
716
- go func () {
717
- defer wg .Done ()
718
- timer := time .NewTimer (1 * time .Minute )
719
- defer timer .Stop ()
720
- select {
721
- case checkpointed := <- checkpointedSpansCh :
722
- isBlockingCheckpoint .Store (false )
723
- var sg roachpb.SpanGroup
724
- sg .Add (checkpointed ... )
725
- // Ensure that the spans we completed before any PAUSE is
726
- // fully contained in our checkpointed completed spans group.
727
- require .True (t , sg .Encloses (spansCompletedBeforePause ... ))
728
- case <- timer .C :
729
- require .Fail (t , "timed out waiting for checkpoint" )
730
- }
731
- }()
732
-
733
- wg .Add (1 )
734
- go func () {
735
- defer wg .Done ()
736
-
737
- isBlockingBackfillProgress .Store (false )
738
- timer := time .NewTimer (1 * time .Minute )
739
- defer timer .Stop ()
740
- select {
741
- case backfillProgressUpdateCh <- struct {}{}:
742
- <- backfillProgressCompletedCh
743
- case <- timer .C :
744
- require .Fail (t , "timed out waiting for backfill progress" )
745
- }
746
- }()
723
+ ensureCompletedSpansAreCheckpointed ()
724
+ t .Logf ("pausing backfill" )
725
+ _ , err = db .Exec (`PAUSE JOB $1` , jobID )
726
+ require .NoError (t , err )
727
+ ensureJobState ("paused" )
747
728
748
- // Wait for both operations to complete
749
- wg .Wait ()
729
+ t .Logf ("resuming backfill" )
730
+ _ , err = db .Exec (`RESUME JOB $1` , jobID )
731
+ require .NoError (t , err )
732
+ ensureJobState ("running" )
750
733
751
- // Now we can wait for the job to succeed
752
- ensureJobState ("succeeded" )
753
- return nil
754
- })
734
+ // Step forward again before re-pausing.
735
+ for i := 0 ; i < 2 ; i ++ {
736
+ receiveProgressUpdate ()
737
+ }
738
+
739
+ ensureCompletedSpansAreCheckpointed ()
740
+ isBlockingBackfillProgress .Store (false )
741
+
742
+ t .Logf ("pausing backfill" )
743
+ _ , err = db .Exec (`PAUSE JOB $1` , jobID )
744
+ require .NoError (t , err )
745
+ ensureJobState ("paused" )
746
+
747
+ t .Logf ("resuming backfill" )
748
+ _ , err = db .Exec (`RESUME JOB $1` , jobID )
749
+ require .NoError (t , err )
750
+ ensureJobState ("running" )
751
+
752
+ // Now we can wait for the job to succeed
753
+ ensureJobState ("succeeded" )
755
754
756
755
if err = g .Wait (); err != nil {
757
756
require .NoError (t , err )
0 commit comments