Skip to content

Commit d1b048a

Browse files
authored
Handle failure to receive/validate a decision (#436)
Previously we'd panic. Now, we "handle" this by stopping. We will automatically restart if/when we receive a _valid_ finality certificate over the certificate exchange.
1 parent d0f5a05 commit d1b048a

File tree

7 files changed

+109
-40
lines changed

7 files changed

+109
-40
lines changed

emulator/host.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ func (h *driverHost) RequestBroadcast(mb *gpbft.MessageBuilder) error {
5050
return nil
5151
}
5252

53-
func (h *driverHost) ReceiveDecision(decision *gpbft.Justification) time.Time {
53+
func (h *driverHost) ReceiveDecision(decision *gpbft.Justification) (time.Time, error) {
5454
require.NoError(h.t, h.maybeReceiveDecision(decision))
55-
return h.now
55+
return h.now, nil
5656
}
5757

5858
func (h *driverHost) maybeReceiveDecision(decision *gpbft.Justification) error {

gpbft/api.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ type DecisionReceiver interface {
123123
// The notification must return the timestamp at which the next instance should begin,
124124
// based on the decision received (which may be in the past).
125125
// E.g. this might be: finalised tipset timestamp + epoch duration + stabilisation delay.
126-
ReceiveDecision(decision *Justification) time.Time
126+
ReceiveDecision(decision *Justification) (time.Time, error)
127127
}
128128

129129
// Tracer collects trace logs that capture logical state changes.

gpbft/mock_host_test.go

Lines changed: 15 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gpbft/participant.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ type Participant struct {
3434
gpbft *instance
3535
// Messages queued for future instances.
3636
mqueue *messageQueue
37-
// The output from the last terminated Granite instance.
38-
finalised *Justification
3937
// The round number during which the last instance was terminated.
4038
// This is for informational purposes only. It does not necessarily correspond to the
4139
// protocol round for which a strong quorum of COMMIT messages was observed,
@@ -95,7 +93,8 @@ func (p *Participant) StartInstanceAt(instance uint64, when time.Time) (err erro
9593

9694
// Finish current instance to clean old committees and old messages queued
9795
// and prepare to begin a new instance.
98-
p.finishCurrentInstance(instance)
96+
_ = p.finishCurrentInstance()
97+
p.beginNextInstance(instance)
9998

10099
// Set the alarm to begin a new instance at the specified time.
101100
p.host.SetAlarm(when)
@@ -266,21 +265,31 @@ func (p *Participant) fetchCommittee(instance uint64) (*committee, error) {
266265
}
267266

268267
func (p *Participant) handleDecision() {
269-
if p.terminated() {
270-
p.finishCurrentInstance(p.currentInstance + 1)
271-
nextStart := p.host.ReceiveDecision(p.finalised)
272-
// Set an alarm at which to fetch the next chain and begin a new instance.
268+
if !p.terminated() {
269+
return
270+
}
271+
decision := p.finishCurrentInstance()
272+
nextStart, err := p.host.ReceiveDecision(decision)
273+
if err != nil {
274+
p.tracer.Log("failed to receive decision: %+v", err)
275+
p.host.SetAlarm(time.Time{})
276+
} else {
277+
p.beginNextInstance(p.currentInstance + 1)
273278
p.host.SetAlarm(nextStart)
274279
}
275280
}
276281

277-
func (p *Participant) finishCurrentInstance(nextInstance uint64) {
282+
func (p *Participant) finishCurrentInstance() *Justification {
283+
var decision *Justification
278284
if p.gpbft != nil {
279-
p.finalised = p.gpbft.terminationValue
285+
decision = p.gpbft.terminationValue
280286
p.terminatedDuringRound = p.gpbft.round
281287
}
282288
p.gpbft = nil
289+
return decision
290+
}
283291

292+
func (p *Participant) beginNextInstance(nextInstance uint64) {
284293
p.instanceMutex.Lock()
285294
defer p.instanceMutex.Unlock()
286295
// Clean all messages queued and old committees for instances below the next one.

host.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"bytes"
55
"context"
66
"errors"
7+
"fmt"
78
"slices"
89
"time"
910

@@ -557,7 +558,16 @@ func (h *gpbftHost) SetAlarm(at time.Time) {
557558
log.Debugf("set alarm for %v", at)
558559
// we cannot reuse the timer because we don't know if it was read or not
559560
h.alertTimer.Stop()
560-
h.alertTimer = time.NewTimer(time.Until(at))
561+
if at.IsZero() {
562+
// It "at" is zero, we cancel the timer entirely. Unfortunately, we still have to
563+
// replace it for the reason stated above.
564+
h.alertTimer = time.NewTimer(0)
565+
if !h.alertTimer.Stop() {
566+
<-h.alertTimer.C
567+
}
568+
} else {
569+
h.alertTimer = time.NewTimer(max(0, time.Until(at)))
570+
}
561571
}
562572

563573
// Receives a finality decision from the instance, with signatures from a strong quorum
@@ -566,14 +576,16 @@ func (h *gpbftHost) SetAlarm(at time.Time) {
566576
// The notification must return the timestamp at which the next instance should begin,
567577
// based on the decision received (which may be in the past).
568578
// E.g. this might be: finalised tipset timestamp + epoch duration + stabilisation delay.
569-
func (h *gpbftHost) ReceiveDecision(decision *gpbft.Justification) time.Time {
579+
func (h *gpbftHost) ReceiveDecision(decision *gpbft.Justification) (time.Time, error) {
570580
log.Infof("got decision at instance %d, finalized head at epoch: %d",
571581
decision.Vote.Instance, decision.Vote.Value.Head().Epoch)
572582
cert, err := h.saveDecision(decision)
573583
if err != nil {
574-
log.Errorf("error while saving decision: %+v", err)
584+
err := fmt.Errorf("error while saving decision: %+v", err)
585+
log.Error(err)
586+
return time.Time{}, err
575587
}
576-
return (*gpbftRunner)(h).computeNextInstanceStart(cert)
588+
return (*gpbftRunner)(h).computeNextInstanceStart(cert), nil
577589
}
578590

579591
func (h *gpbftHost) saveDecision(decision *gpbft.Justification) (*certs.FinalityCertificate, error) {

sim/host.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,10 @@ func (v *simHost) Time() time.Time {
100100
return v.sim.network.Time()
101101
}
102102

103-
func (v *simHost) ReceiveDecision(decision *gpbft.Justification) time.Time {
103+
func (v *simHost) ReceiveDecision(decision *gpbft.Justification) (time.Time, error) {
104104
v.sim.ec.NotifyDecision(v.id, decision)
105105
v.ecChain = decision.Vote.Value
106-
return v.Time().Add(v.sim.ecEpochDuration).Add(v.sim.ecStabilisationDelay)
106+
return v.Time().Add(v.sim.ecEpochDuration).Add(v.sim.ecStabilisationDelay), nil
107107
}
108108

109109
func (v *simHost) StoragePower(instance uint64) *gpbft.StoragePower {

test/f3_test.go

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import (
44
"context"
55
"fmt"
66
"math/big"
7-
"os"
7+
"sync/atomic"
88
"testing"
99
"time"
1010

@@ -14,7 +14,10 @@ import (
1414
"github.com/filecoin-project/go-f3/gpbft"
1515
"github.com/filecoin-project/go-f3/manifest"
1616
"github.com/filecoin-project/go-f3/sim/signing"
17-
leveldb "github.com/ipfs/go-ds-leveldb"
17+
18+
"github.com/ipfs/go-datastore"
19+
"github.com/ipfs/go-datastore/failstore"
20+
ds_sync "github.com/ipfs/go-datastore/sync"
1821
pubsub "github.com/libp2p/go-libp2p-pubsub"
1922
"github.com/libp2p/go-libp2p/core/host"
2023
"github.com/libp2p/go-libp2p/core/peer"
@@ -79,6 +82,37 @@ func TestPauseResumeCatchup(t *testing.T) {
7982
env.waitForInstanceNumber(node0failInstance+3, 30*time.Second, false)
8083
}
8184

85+
func TestFailRecover(t *testing.T) {
86+
env := newTestEnvironment(t, 2, false)
87+
88+
// Make it possible to fail a single write for node 0.
89+
var failDsWrite atomic.Bool
90+
dsFailureFunc := func(op string) error {
91+
if failDsWrite.Load() {
92+
switch op {
93+
case "put", "batch-put":
94+
failDsWrite.Store(false)
95+
return xerrors.Errorf("FAILURE!")
96+
}
97+
}
98+
return nil
99+
}
100+
101+
env.injectDatastoreFailures(0, dsFailureFunc)
102+
103+
env.connectAll()
104+
env.start()
105+
env.waitForInstanceNumber(1, 10*time.Second, true)
106+
107+
// Inject a single write failure. This should prevent us from storing a single decision
108+
// decision.
109+
failDsWrite.Store(true)
110+
111+
// We should proceed anyways (catching up via the certificate exchange protocol).
112+
oldInstance := env.nodes[0].currentGpbftInstance()
113+
env.waitForInstanceNumber(oldInstance+3, 10*time.Second, true)
114+
}
115+
82116
func TestDynamicManifest_WithoutChanges(t *testing.T) {
83117
env := newTestEnvironment(t, 2, true)
84118

@@ -182,9 +216,10 @@ var base manifest.Manifest = manifest.Manifest{
182216
}
183217

184218
type testNode struct {
185-
e *testEnv
186-
h host.Host
187-
f3 *f3.F3
219+
e *testEnv
220+
h host.Host
221+
f3 *f3.F3
222+
dsErrFunc func(string) error
188223
}
189224

190225
func (n *testNode) currentGpbftInstance() uint64 {
@@ -486,20 +521,19 @@ func (e *testEnv) newF3Instance(id int, manifestServer peer.ID) (*testNode, erro
486521
return nil, xerrors.Errorf("creating libp2p host: %w", err)
487522
}
488523

524+
n := &testNode{e: e, h: h}
525+
489526
ps, err := pubsub.NewGossipSub(e.testCtx, h)
490527
if err != nil {
491528
return nil, xerrors.Errorf("creating gossipsub: %w", err)
492529
}
493530

494-
tmpdir, err := os.MkdirTemp("", "f3-*")
495-
if err != nil {
496-
return nil, xerrors.Errorf("creating temp dir: %w", err)
497-
}
498-
499-
ds, err := leveldb.NewDatastore(tmpdir, nil)
500-
if err != nil {
501-
return nil, xerrors.Errorf("creating a datastore: %w", err)
502-
}
531+
ds := ds_sync.MutexWrap(failstore.NewFailstore(datastore.NewMapDatastore(), func(s string) error {
532+
if n.dsErrFunc != nil {
533+
return (n.dsErrFunc)(s)
534+
}
535+
return nil
536+
}))
503537

504538
m := e.manifest // copy because we mutate this
505539
var mprovider manifest.ManifestProvider
@@ -511,16 +545,20 @@ func (e *testEnv) newF3Instance(id int, manifestServer peer.ID) (*testNode, erro
511545

512546
e.signingBackend.Allow(int(id))
513547

514-
module, err := f3.New(e.testCtx, mprovider, ds, h, ps, e.signingBackend, e.ec)
548+
n.f3, err = f3.New(e.testCtx, mprovider, ds, h, ps, e.signingBackend, e.ec)
515549
if err != nil {
516550
return nil, xerrors.Errorf("creating module: %w", err)
517551
}
518552

519553
e.errgrp.Go(func() error {
520-
return runMessageSubscription(e.testCtx, module, gpbft.ActorID(id), e.signingBackend)
554+
return runMessageSubscription(e.testCtx, n.f3, gpbft.ActorID(id), e.signingBackend)
521555
})
522556

523-
return &testNode{e: e, h: h, f3: module}, nil
557+
return n, nil
558+
}
559+
560+
func (e *testEnv) injectDatastoreFailures(i int, fn func(op string) error) {
561+
e.nodes[i].dsErrFunc = fn
524562
}
525563

526564
// TODO: This code is copy-pasta from cmd/f3/run.go, consider taking it out into a shared testing lib.

0 commit comments

Comments
 (0)