@@ -17,10 +17,66 @@ import (
17
17
"github.com/cockroachdb/cockroach/pkg/roachprod"
18
18
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
19
19
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
20
+ "github.com/cockroachdb/cockroach/pkg/util/syncutil"
20
21
"github.com/cockroachdb/errors"
21
22
"golang.org/x/sync/errgroup"
22
23
)
23
24
25
+ // monitorProcess represents a single process that the monitor monitors.
26
+ type monitorProcess struct {
27
+ node install.Node
28
+ virtualClusterName string
29
+ sqlInstance int
30
+ }
31
+
32
// MonitorExpectedProcessHealth describes the state the monitor expects a
// process to be in.
type MonitorExpectedProcessHealth string

// Health states that can be registered for a process.
const (
	// ExpectedAlive marks a process that is expected to be running.
	ExpectedAlive MonitorExpectedProcessHealth = "process alive"
	// ExpectedDead marks a process that is expected to be dead.
	ExpectedDead MonitorExpectedProcessHealth = "process dead"
)
39
+
40
+ // expectedProcessHealth is a concurrent map that stores the expected health of
41
+ // each registered monitorProcess. It is a thin wrapper over syncutil.Map that ensures
42
+ // consistent naming of the system interface.
43
+ type expectedProcessHealth struct {
44
+ syncutil.Map [monitorProcess , MonitorExpectedProcessHealth ]
45
+ }
46
+
47
+ func newProcess (node install.Node , virtualClusterName string , sqlInstance int ) monitorProcess {
48
+ if virtualClusterName == "" {
49
+ virtualClusterName = install .SystemInterfaceName
50
+ }
51
+ return monitorProcess {
52
+ node : node ,
53
+ virtualClusterName : virtualClusterName ,
54
+ sqlInstance : sqlInstance ,
55
+ }
56
+ }
57
+
58
+ func (m * expectedProcessHealth ) get (
59
+ node install.Node , virtualClusterName string , sqlInstance int ,
60
+ ) MonitorExpectedProcessHealth {
61
+ val , ok := m .Load (newProcess (node , virtualClusterName , sqlInstance ))
62
+ if ! ok {
63
+ // If the process has no expected state, assume it should be healthy.
64
+ return ExpectedAlive
65
+ }
66
+ return * val
67
+ }
68
+
69
+ func (m * expectedProcessHealth ) set (
70
+ nodes install.Nodes ,
71
+ virtualClusterName string ,
72
+ sqlInstance int ,
73
+ health MonitorExpectedProcessHealth ,
74
+ ) {
75
+ for _ , node := range nodes {
76
+ m .Store (newProcess (node , virtualClusterName , sqlInstance ), & health )
77
+ }
78
+ }
79
+
24
80
// monitorImpl implements the Monitor interface. A monitor both
25
81
// manages "user tasks" -- goroutines provided by tests -- as well as
26
82
// checks that every node in the cluster is still running. A failure
@@ -46,7 +102,16 @@ type monitorImpl struct {
46
102
monitorGroup * errgroup.Group // monitor goroutine
47
103
monitorOnce sync.Once // guarantees monitor goroutine is only started once
48
104
49
- expDeaths int32 // atomically
105
+ // expExactProcessDeath if true indicates that the monitor should expect that a
106
+ // specified process, as denoted by the triple in expProcessHealth.get, is dead.
107
+ // Otherwise, the monitor will only expect a certain number of process deaths.
108
+ // The former is a stronger assertion used in the new global roachtest monitor,
109
+ // while the latter should be removed when the deprecated cluster monitor is removed.
110
+ expExactProcessDeath bool
111
+ // Deprecated: This field is used by the deprecated cluster monitor to track the number
112
+ // of expected process deaths, and should be removed when the cluster monitor is removed.
113
+ expDeaths int32 // atomically
114
+ expProcessHealth expectedProcessHealth
50
115
}
51
116
52
117
func newMonitor (
@@ -58,19 +123,46 @@ func newMonitor(
58
123
L () * logger.Logger
59
124
},
60
125
c cluster.Cluster ,
126
+ expectExactProcessDeath bool ,
61
127
opts ... option.Option ,
62
128
) * monitorImpl {
63
129
m := & monitorImpl {
64
- t : t ,
65
- l : t .L (),
66
- nodes : c .MakeNodes (opts ... ),
130
+ t : t ,
131
+ l : t .L (),
132
+ nodes : c .MakeNodes (opts ... ),
133
+ expExactProcessDeath : expectExactProcessDeath ,
134
+ expProcessHealth : expectedProcessHealth {},
67
135
}
68
136
m .ctx , m .cancel = context .WithCancel (ctx )
69
137
m .userGroup , _ = errgroup .WithContext (m .ctx )
70
138
m .monitorGroup , _ = errgroup .WithContext (m .ctx )
71
139
return m
72
140
}
73
141
142
+ func (m * monitorImpl ) ExpectProcessHealth (
143
+ nodes install.Nodes , health MonitorExpectedProcessHealth , opts ... option.OptionFunc ,
144
+ ) {
145
+ var virtualClusterOptions option.VirtualClusterOptions
146
+ if err := option .Apply (& virtualClusterOptions , opts ... ); err != nil {
147
+ m .t .Fatal (err )
148
+ }
149
+ m .expProcessHealth .set (nodes , virtualClusterOptions .VirtualClusterName , virtualClusterOptions .SQLInstance , health )
150
+ }
151
+
152
+ // ExpectProcessDeath lets the monitor know that a set of processes are about
153
+ // to be killed, and that their deaths should be ignored. Virtual cluster
154
+ // options can be passed to denote a separate process.
155
+ func (m * monitorImpl ) ExpectProcessDead (nodes option.NodeListOption , opts ... option.OptionFunc ) {
156
+ m .ExpectProcessHealth (nodes .InstallNodes (), ExpectedDead , opts ... )
157
+ }
158
+
159
+ // ExpectProcessAlive lets the monitor know that a set of processes are
160
+ // expected to be healthy. Virtual cluster options can be passed to denote
161
+ // a separate process.
162
+ func (m * monitorImpl ) ExpectProcessAlive (nodes option.NodeListOption , opts ... option.OptionFunc ) {
163
+ m .ExpectProcessHealth (nodes .InstallNodes (), ExpectedAlive , opts ... )
164
+ }
165
+
74
166
// ExpectDeath lets the monitor know that a node is about to be killed, and that
75
167
// this should be ignored.
76
168
func (m * monitorImpl ) ExpectDeath () {
@@ -190,12 +282,16 @@ func (m *monitorImpl) startNodeMonitor() {
190
282
)
191
283
}
192
284
case install.MonitorProcessDead :
193
- isExpectedDeath := atomic .AddInt32 (& m .expDeaths , - 1 ) >= 0
194
- if isExpectedDeath {
195
- expectedDeathStr = ": expected"
285
+ var isExpectedDeath bool
286
+ if m .expExactProcessDeath {
287
+ isExpectedDeath = m .expProcessHealth .get (info .Node , e .VirtualClusterName , e .SQLInstance ) == ExpectedDead
288
+ } else {
289
+ isExpectedDeath = atomic .AddInt32 (& m .expDeaths , - 1 ) >= 0
196
290
}
197
291
198
- if ! isExpectedDeath {
292
+ if isExpectedDeath {
293
+ expectedDeathStr = ": expected"
294
+ } else {
199
295
retErr = fmt .Errorf ("unexpected node event: %s" , info )
200
296
}
201
297
}
0 commit comments