Skip to content

Commit b0d9ca4

Browse files
authored
Fix bug in F3 stop and use start/stop in catchup tests (#1029)
Fix a bug where F3.Stop was never calling `stopInternal` which actually resets the state and stops the internal components. Remove the testing API exposed for pause resume in favor of stop start. Because, now that manifest is final, there's no pause/resume and the scenario that was being tested is irrelevant. Instead, stop the nodes completely then start them back up to assert that they do catch up. The flaky test was repeated for 10 minutes on CI without failing. Fixes: #1025
1 parent 20d68df commit b0d9ca4

File tree

3 files changed

+17
-32
lines changed

3 files changed

+17
-32
lines changed

f3.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ func (m *F3) Start(startCtx context.Context) (_err error) {
201201
}
202202

203203
// Stop F3.
204-
func (m *F3) Stop(context.Context) (_err error) {
204+
func (m *F3) Stop(ctx context.Context) (_err error) {
205205
m.cancelCtx()
206-
return nil
206+
return m.stopInternal(ctx)
207207
}
208208

209209
func (s *f3State) stop(ctx context.Context) (err error) {

f3_helpers_test.go

Lines changed: 0 additions & 16 deletions
This file was deleted.

f3_test.go

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func TestF3WithLookback(t *testing.T) {
9494
env.requireInstanceEventually(5, eventualCheckTimeout, true)
9595
}
9696

97-
func TestF3PauseResumeCatchup(t *testing.T) {
97+
func TestF3StopStartCatchup(t *testing.T) {
9898
// Quiet down the logs since the test asserts a scenario that triggers
9999
// OhShitStore ERROR level logs.
100100
_ = logging.SetLogLevel("f3/ohshitstore", "DPANIC")
@@ -104,8 +104,8 @@ func TestF3PauseResumeCatchup(t *testing.T) {
104104
env.requireEpochFinalizedEventually(env.manifest.BootstrapEpoch, eventualCheckTimeout)
105105

106106
// Pausing two nodes should pause the network.
107-
env.pauseNode(1)
108-
env.pauseNode(2)
107+
env.stopNode(1)
108+
env.stopNode(2)
109109

110110
env.requireF3NotRunningEventually(eventualCheckTimeout, nodeMatchers.byID(1, 2))
111111

@@ -118,22 +118,23 @@ func TestF3PauseResumeCatchup(t *testing.T) {
118118
}, eventualCheckTimeout, eventualCheckInterval)
119119

120120
// Resuming node 1 should continue agreeing on instances.
121-
env.resumeNode(1)
121+
env.startNode(1)
122+
env.connectAll()
122123
env.requireF3RunningEventually(eventualCheckTimeout, nodeMatchers.byID(1))
123124

124125
// Wait until we're far enough that pure GPBFT catchup should be impossible.
125126
targetInstance := env.nodes[1].currentGpbftInstance() + env.manifest.CommitteeLookback + 1
126127
env.requireInstanceEventually(targetInstance, eventualCheckTimeout, false)
127128

128-
env.resumeNode(2)
129+
env.startNode(2)
129130
env.requireF3RunningEventually(eventualCheckTimeout, nodeMatchers.byID(2))
130131

131132
// Everyone should catch up eventually
132133
env.requireInstanceEventually(targetInstance, eventualCheckTimeout, true)
133134

134135
// Pause the "good" node.
135136
node0failInstance := env.nodes[0].currentGpbftInstance()
136-
env.pauseNode(0)
137+
env.stopNode(0)
137138
env.requireF3NotRunningEventually(eventualCheckTimeout, nodeMatchers.byID(0))
138139

139140
// We should be able to make progress with the remaining nodes.
@@ -337,12 +338,12 @@ func (n *testNode) init() *f3.F3 {
337338
return n.f3
338339
}
339340

340-
func (n *testNode) pause() {
341-
require.NoError(n.e.t, n.f3.Pause(n.e.testCtx))
341+
func (n *testNode) stop() {
342+
require.NoError(n.e.t, n.f3.Stop(n.e.testCtx))
342343
}
343344

344-
func (n *testNode) resume() {
345-
require.NoError(n.e.t, n.f3.Resume(n.e.testCtx))
345+
func (n *testNode) start() {
346+
require.NoError(n.e.t, n.f3.Start(n.e.testCtx))
346347
}
347348

348349
type testNodeStatus struct {
@@ -629,12 +630,12 @@ func (e *testEnv) withManifest(m manifest.Manifest) *testEnv {
629630
return e
630631
}
631632

632-
func (e *testEnv) pauseNode(i int) {
633-
e.nodes[i].pause()
633+
func (e *testEnv) stopNode(i int) {
634+
e.nodes[i].stop()
634635
}
635636

636-
func (e *testEnv) resumeNode(i int) {
637-
e.nodes[i].resume()
637+
func (e *testEnv) startNode(i int) {
638+
e.nodes[i].start()
638639
}
639640

640641
func (e *testEnv) injectDatastoreFailures(i int, fn func(op string) error) {

0 commit comments

Comments
 (0)