@@ -22,6 +22,7 @@ import (
22
22
"fmt"
23
23
"io"
24
24
"os"
25
+ "path"
25
26
"slices"
26
27
"sort"
27
28
"strings"
@@ -14398,6 +14399,117 @@ func TestValidateKeyspace(t *testing.T) {
14398
14399
}
14399
14400
}
14400
14401
14402
+ // This test validates the bug described in https://github.com/vitessio/vitess/issues/18113.
14403
+ func TestValidateKeyspaceErrors (t * testing.T ) {
14404
+ ctx , cancel := context .WithCancel (context .Background ())
14405
+ defer cancel ()
14406
+ cell := "zone-1"
14407
+ ks := "ks"
14408
+ shard := "-"
14409
+ // Setup the topo server and the vtctld server.
14410
+ ts := memorytopo .NewServer (ctx , cell )
14411
+ tmc := & testutil.TabletManagerClient {}
14412
+ vtctldServer := NewTestVtctldServer (ts , tmc )
14413
+ conn , err := ts .ConnForCell (ctx , cell )
14414
+ require .NoError (t , err )
14415
+
14416
+ // Create the keyspace and shard record
14417
+ _ , err = ts .GetOrCreateShard (ctx , ks , shard )
14418
+ require .NoError (t , err )
14419
+
14420
+ // verify ValidateKeyspace finds all the issues without hanging
14421
+ resp := validateKeyspace (t , vtctldServer , ctx , ks )
14422
+ // We expect two results -
14423
+ // 1. no primary for shard ks/-
14424
+ // 2. no primary in shard record ks/-
14425
+ require .Len (t , resp .GetResultsByShard ()[shard ].Results , 2 )
14426
+
14427
+ // Next we test the worst case for ValidateKeyspace wherein everything creates
14428
+ // issues to ensure we never block on writing to the results field.
14429
+
14430
+ // We first create all tablets. We intentionally create a mismatch in the tablet alias
14431
+ // file name and the tablet alias actually stored so that we see the error in Validate.
14432
+ numTablets := 10
14433
+ tablets := make ([]* topodatapb.Tablet , numTablets )
14434
+ for i := 0 ; i < numTablets ; i ++ {
14435
+ typ := topodatapb .TabletType_REPLICA
14436
+ if i == 0 {
14437
+ typ = topodatapb .TabletType_PRIMARY
14438
+ }
14439
+ tablets [i ] = & topodatapb.Tablet {
14440
+ Keyspace : ks ,
14441
+ Shard : shard ,
14442
+ Type : typ ,
14443
+ Alias : & topodatapb.TabletAlias {
14444
+ Cell : cell ,
14445
+ Uid : uint32 (100 + i ),
14446
+ },
14447
+ }
14448
+ err = ts .CreateTablet (ctx , tablets [i ])
14449
+ require .NoError (t , err )
14450
+
14451
+ // We're explicitly messing up the record, so we can't use
14452
+ // the UpdateTablet function as it doesn't not allow this.
14453
+ tabletPath := path .Join (topo .TabletsPath , topoproto .TabletAliasString (tablets [i ].Alias ), topo .TabletFile )
14454
+ tablets [i ].Alias .Uid += 100
14455
+ data , err := tablets [i ].MarshalVT ()
14456
+ require .NoError (t , err )
14457
+ _ , err = conn .Update (ctx , tabletPath , data , nil )
14458
+ require .NoError (t , err )
14459
+ // Reset the values for later use.
14460
+ tablets [i ].Alias .Uid -= 100
14461
+ }
14462
+
14463
+ // verify ValidateKeyspace finds all the issues without hanging
14464
+ resp = validateKeyspace (t , vtctldServer , ctx , ks )
14465
+ // This time we expect the following results -
14466
+ // 1. primary mismatch for shard ks/-: found zone-1-0000000100, expected <nil>
14467
+ // 2. no primary in shard record ks/-
14468
+ // 3. topo.Validate for all numTablets tablets.
14469
+ // 4. tabletmanager ping for all numTablets tablets with a general error.
14470
+ require .Len (t , resp .GetResultsByShard ()[shard ].Results , 2 * numTablets + 2 , strings .Join (resp .GetResultsByShard ()[shard ].Results , "," ))
14471
+
14472
+ // We now fix the shard record so that we can get even more errors.
14473
+ _ , err = ts .UpdateShardFields (ctx , ks , shard , func (info * topo.ShardInfo ) error {
14474
+ info .PrimaryAlias = tablets [0 ].Alias
14475
+ return nil
14476
+ })
14477
+ require .NoError (t , err )
14478
+
14479
+ tmc .GetReplicasResults = map [string ]struct {
14480
+ Replicas []string
14481
+ Error error
14482
+ }{
14483
+ "zone-1-0000000200" : {
14484
+ // We return numTablets-1 values for the replica tablets.
14485
+ Replicas : make ([]string , numTablets - 1 ),
14486
+ },
14487
+ }
14488
+
14489
+ // verify ValidateKeyspace finds all the issues without hanging
14490
+ resp = validateKeyspace (t , vtctldServer , ctx , ks )
14491
+ // This time we expect the following results results -
14492
+ // 1. topo.Validate for all numTablets tablets.
14493
+ // 2. Failure in resolving the hostname for all numTablets tablets.
14494
+ // 3. Replicas not being in the replication graph (since we couldn't get their hostnames). This is for all the replicas -> numTablets - 1
14495
+ // 4. Resolving hostname again for all the replicas -> numTablets - 1
14496
+ // 5. tabletmanager ping for all numTablets tablets with a general error.
14497
+ require .Len (t , resp .GetResultsByShard ()[shard ].Results , 5 * numTablets - 2 , strings .Join (resp .GetResultsByShard ()[shard ].Results , "," ))
14498
+ }
14499
+
14500
+ // validateKeyspace is a helper function for testing.
14501
+ func validateKeyspace (t * testing.T , vtctldServer * VtctldServer , ctx context.Context , ks string ) * vtctldatapb.ValidateKeyspaceResponse {
14502
+ t .Helper ()
14503
+ resp , err := vtctldServer .ValidateKeyspace (ctx , & vtctldatapb.ValidateKeyspaceRequest {
14504
+ Keyspace : ks ,
14505
+ PingTablets : true ,
14506
+ })
14507
+ require .NoError (t , err )
14508
+ require .NotNil (t , resp )
14509
+ require .NotNil (t , resp .GetResultsByShard ())
14510
+ return resp
14511
+ }
14512
+
14401
14513
func TestValidateShard (t * testing.T ) {
14402
14514
t .Parallel ()
14403
14515
0 commit comments