Skip to content

Commit 195de37

Browse files
craig[bot]DarrylWong
andcommitted
Merge #155262
155262: roachtest: fix maybeSaveClusterDueToInvariantProblems r=herkolategan a=DarrylWong This check previously called c.Nodes, which constructs a node list with the passed in options. With nothing passed, this constructs an empty node list which caused this check to always return. Fixes: none Epic: none Release note: none --------- Was experimenting with taking disk snapshots and struggling to get it to work. Cross referenced with this and realized its been silently skipping the check 😞 Co-authored-by: DarrylWong <[email protected]>
2 parents c81963c + 973d78a commit 195de37

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

pkg/cmd/roachtest/test_runner.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1814,7 +1814,7 @@ func gatherFatalNodeLogs(t *testImpl, testLogger *logger.Logger) (string, error)
18141814
func (r *testRunner) maybeSaveClusterDueToInvariantProblems(
18151815
ctx context.Context, t *testImpl, c *clusterImpl,
18161816
) {
1817-
if len(c.Nodes()) == 0 {
1817+
if len(c.All()) == 0 {
18181818
return // test only
18191819
}
18201820
dets, err := c.RunWithDetails(ctx, t.L(), option.WithNodes(c.All()),
@@ -1834,8 +1834,11 @@ func (r *testRunner) maybeSaveClusterDueToInvariantProblems(
18341834
for _, det := range dets {
18351835
if det.Stdout != "" {
18361836
_ = c.Extend(ctx, 7*24*time.Hour, t.L())
1837-
timestamp := timeutil.Now().Format("20060102_150405")
1838-
snapName := fmt.Sprintf("invariant-problem-%s-%s", c.Name(), timestamp)
1837+
timestamp := timeutil.Now().UnixMilli()
1838+
// We take the risk that two tests could attempt to create a snapshot
1839+
// at the same exact millisecond, as we have a 63 character limit on
1840+
// the name and the cluster name usually exceeds this by itself.
1841+
snapName := fmt.Sprintf("invariant-problem-%d", timestamp)
18391842
if _, err := c.CreateSnapshot(ctx, snapName); err != nil {
18401843
t.L().Printf("failed to create snapshot %q: %s", snapName, err)
18411844
snapName = "<failed>"

pkg/cmd/roachtest/tests/invariant_check_detection.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func registerInvariantCheckDetection(r registry.Registry) {
3030
Name: fmt.Sprintf("invariant-check-detection/failed=%t", failed),
3131
Owner: registry.OwnerTestEng,
3232
Suites: registry.ManualOnly,
33-
Cluster: spec.ClusterSpec{NodeCount: 1, CPUs: 4, ReusePolicy: spec.ReusePolicyNone{}},
33+
Cluster: r.MakeClusterSpec(1, spec.CPU(4), spec.ReuseNone(), spec.VolumeSize(100), spec.GCEVolumeType("pd-ssd")),
3434
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
3535
runInvariantCheckDetection(ctx, t, c, failed)
3636
},

pkg/roachprod/vm/gce/gcloud.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,16 @@ func (p *Provider) ListVolumes(l *logger.Logger, v *vm.VM) ([]vm.Volume, error)
874874
}
875875
}
876876

877+
// Unfortunately the above responses contain no common identifier to join
878+
// on, so we must resort to indexing. This does not work if the number of
879+
// attached disks is different from the number of described volumes, i.e.
880+
// if the cluster is using local SSDs.
881+
if len(attachedDisks) != len(describedVolumes) {
882+
err := errors.Newf("expected to find the same number of attached disks (%d) as described volumes (%d)",
883+
len(attachedDisks), len(describedVolumes))
884+
return nil, errors.WithHint(err, "is the cluster using local SSD?")
885+
}
886+
877887
var volumes []vm.Volume
878888
for idx := range attachedDisks {
879889
attachedDisk := attachedDisks[idx]

0 commit comments

Comments
 (0)