Commit 57d4898
p2p/dnsdisc: re-check tree root when leaf resolution fails (#20682)
This adds additional logic to re-resolve the root name of a tree when a couple of leaf requests have failed. We need this change to avoid getting into a failure state where leaf requests keep failing for half an hour when the tree has been updated.
1 parent c211798 · commit 57d4898
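For orientation before the diff: the client now keeps a per-tree counter of failed leaf (ENR and link) lookups, and treats the root as due for re-resolution once that counter passes a small threshold instead of waiting for the regular recheck interval. The snippet below is a minimal, self-contained sketch of that decision only, not the go-ethereum code itself; treeState and the plain time.Time/time.Duration parameters are simplified stand-ins for the clientTree fields, mclock types, and Config values that appear in p2p/dnsdisc/sync.go further down.

package main

import (
	"fmt"
	"time"
)

// rootRecheckFailCount mirrors the constant added in p2p/dnsdisc/sync.go:
// once more than this many leaf requests have failed, the root is
// re-resolved without waiting for the scheduled recheck.
const rootRecheckFailCount = 5

// treeState is a simplified stand-in for the clientTree fields that drive
// the decision; the real type also tracks the root entry and subtree syncs.
type treeState struct {
	haveRoot      bool      // a root entry has been resolved at least once
	leafFailCount int       // failed leaf (ENR/link) lookups since the last good root
	lastRootCheck time.Time // time of the last root revalidation
}

// rootUpdateDue mirrors the updated check: a root update is due when there
// is no root yet, when too many leaf requests have failed, or when the
// regular recheck interval has elapsed.
func (t *treeState) rootUpdateDue(now time.Time, recheckInterval time.Duration) bool {
	tooManyFailures := t.leafFailCount > rootRecheckFailCount
	scheduledCheck := now.Sub(t.lastRootCheck) > recheckInterval
	return !t.haveRoot || tooManyFailures || scheduledCheck
}

func main() {
	st := &treeState{haveRoot: true, lastRootCheck: time.Now()}
	for i := 0; i < 7; i++ {
		st.leafFailCount++ // pretend another leaf lookup just failed
		fmt.Printf("failures=%d rootUpdateDue=%v\n", st.leafFailCount, st.rootUpdateDue(time.Now(), 30*time.Minute))
	}
	// The root becomes due after the sixth failure (count > 5), long before
	// the 30-minute interval would have triggered a recheck on its own.
}

In the actual change the counter is incremented by a deferred check in syncRandom whenever a leaf request returns an error, and both failure counters are reset as soon as a root is resolved successfully.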

2 files changed, +113 -18 lines changed

p2p/dnsdisc/client_test.go

Lines changed: 57 additions & 9 deletions
@@ -19,6 +19,7 @@ package dnsdisc
 import (
 	"context"
 	"crypto/ecdsa"
+	"errors"
 	"math/rand"
 	"reflect"
 	"testing"
@@ -176,11 +177,62 @@ func TestIteratorNodeUpdates(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// sync the original tree.
+	// Sync the original tree.
 	resolver.add(tree1.ToTXT("n"))
 	checkIterator(t, it, nodes[:25])
 
-	// Update some nodes and ensure RandomNode returns the new nodes as well.
+	// Ensure RandomNode returns the new nodes after the tree is updated.
+	updateSomeNodes(nodesSeed1, nodes)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.clear()
+	resolver.add(tree2.ToTXT("n"))
+	t.Log("tree updated")
+
+	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+	checkIterator(t, it, nodes)
+}
+
+// This test checks that the tree root is rechecked when a couple of leaf
+// requests have failed. The test is just like TestIteratorNodeUpdates, but
+// without advancing the clock by recheckInterval after the tree update.
+func TestIteratorRootRecheckOnFail(t *testing.T) {
+	var (
+		clock    = new(mclock.Simulated)
+		nodes    = testNodes(nodesSeed1, 30)
+		resolver = newMapResolver()
+		c        = NewClient(Config{
+			Resolver:        resolver,
+			Logger:          testlog.Logger(t, log.LvlTrace),
+			RecheckInterval: 20 * time.Minute,
+			RateLimit:       500,
+			// Disabling the cache is required for this test because the client doesn't
+			// notice leaf failures if all records are cached.
+			CacheLimit: 1,
+		})
+	)
+	c.clock = clock
+	tree1, url := makeTestTree("n", nodes[:25], nil)
+	it, err := c.NewIterator(url)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Sync the original tree.
+	resolver.add(tree1.ToTXT("n"))
+	checkIterator(t, it, nodes[:25])
+
+	// Ensure RandomNode returns the new nodes after the tree is updated.
+	updateSomeNodes(nodesSeed1, nodes)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.clear()
+	resolver.add(tree2.ToTXT("n"))
+	t.Log("tree updated")
+
+	checkIterator(t, it, nodes)
+}
+
+// updateSomeNodes applies ENR updates to some of the given nodes.
+func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
 	keys := testKeys(nodesSeed1, len(nodes))
 	for i, n := range nodes[:len(nodes)/2] {
 		r := n.Record()
@@ -190,11 +242,6 @@ func TestIteratorNodeUpdates(t *testing.T) {
 		n2, _ := enode.New(enode.ValidSchemes, r)
 		nodes[i] = n2
 	}
-	tree2, _ := makeTestTree("n", nodes, nil)
-	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
-	resolver.clear()
-	resolver.add(tree2.ToTXT("n"))
-	checkIterator(t, it, nodes)
 }
 
 // This test verifies that randomIterator re-checks the root of the tree to catch
@@ -230,9 +277,10 @@ func TestIteratorLinkUpdates(t *testing.T) {
 	// Add link to tree3, remove link to tree2.
 	tree1, _ = makeTestTree("t1", nodes[:10], []string{url3})
 	resolver.add(tree1.ToTXT("t1"))
-	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
 	t.Log("tree1 updated")
 
+	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+
 	var wantNodes []*enode.Node
 	wantNodes = append(wantNodes, tree1.Nodes()...)
 	wantNodes = append(wantNodes, tree3.Nodes()...)
@@ -345,5 +393,5 @@ func (mr mapResolver) LookupTXT(ctx context.Context, name string) ([]string, error) {
 	if record, ok := mr[name]; ok {
 		return []string{record}, nil
 	}
-	return nil, nil
+	return nil, errors.New("not found")
 }

p2p/dnsdisc/sync.go

Lines changed: 56 additions & 9 deletions
@@ -25,15 +25,22 @@ import (
 	"github.com/ethereum/go-ethereum/p2p/enode"
 )
 
+const (
+	rootRecheckFailCount = 5 // update root if this many leaf requests fail
+)
+
 // clientTree is a full tree being synced.
 type clientTree struct {
 	c   *Client
 	loc *linkEntry // link to this tree
 
 	lastRootCheck mclock.AbsTime // last revalidation of root
-	root          *rootEntry
-	enrs          *subtreeSync
-	links         *subtreeSync
+	leafFailCount int
+	rootFailCount int
+
+	root  *rootEntry
+	enrs  *subtreeSync
+	links *subtreeSync
 
 	lc       *linkCache          // tracks all links between all trees
 	curLinks map[string]struct{} // links contained in this tree
@@ -46,7 +53,7 @@ func newClientTree(c *Client, lc *linkCache, loc *linkEntry) *clientTree {
 
 // syncAll retrieves all entries of the tree.
 func (ct *clientTree) syncAll(dest map[string]entry) error {
-	if err := ct.updateRoot(); err != nil {
+	if err := ct.updateRoot(context.Background()); err != nil {
 		return err
 	}
 	if err := ct.links.resolveAll(dest); err != nil {
@@ -60,12 +67,20 @@ func (ct *clientTree) syncAll(dest map[string]entry) error {
 
 // syncRandom retrieves a single entry of the tree. The Node return value
 // is non-nil if the entry was a node.
-func (ct *clientTree) syncRandom(ctx context.Context) (*enode.Node, error) {
+func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) {
 	if ct.rootUpdateDue() {
-		if err := ct.updateRoot(); err != nil {
+		if err := ct.updateRoot(ctx); err != nil {
 			return nil, err
 		}
 	}
+
+	// Update fail counter for leaf request errors.
+	defer func() {
+		if err != nil {
+			ct.leafFailCount++
+		}
+	}()
+
 	// Link tree sync has priority, run it to completion before syncing ENRs.
 	if !ct.links.done() {
 		err := ct.syncNextLink(ctx)
@@ -138,15 +153,22 @@ func removeHash(h []string, index int) []string {
 }
 
 // updateRoot ensures that the given tree has an up-to-date root.
-func (ct *clientTree) updateRoot() error {
+func (ct *clientTree) updateRoot(ctx context.Context) error {
+	if !ct.slowdownRootUpdate(ctx) {
+		return ctx.Err()
+	}
+
 	ct.lastRootCheck = ct.c.clock.Now()
-	ctx, cancel := context.WithTimeout(context.Background(), ct.c.cfg.Timeout)
+	ctx, cancel := context.WithTimeout(ctx, ct.c.cfg.Timeout)
 	defer cancel()
 	root, err := ct.c.resolveRoot(ctx, ct.loc)
 	if err != nil {
+		ct.rootFailCount++
 		return err
 	}
 	ct.root = &root
+	ct.rootFailCount = 0
+	ct.leafFailCount = 0
 
 	// Invalidate subtrees if changed.
 	if ct.links == nil || root.lroot != ct.links.root {
@@ -161,7 +183,32 @@ func (ct *clientTree) updateRoot() error {
 
 // rootUpdateDue returns true when a root update is needed.
 func (ct *clientTree) rootUpdateDue() bool {
-	return ct.root == nil || time.Duration(ct.c.clock.Now()-ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	tooManyFailures := ct.leafFailCount > rootRecheckFailCount
+	scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	return ct.root == nil || tooManyFailures || scheduledCheck
+}
+
+// slowdownRootUpdate applies a delay to root resolution if it is tried
+// too frequently. This avoids busy polling when the client is offline.
+// Returns true if the timeout passed, false if sync was canceled.
+func (ct *clientTree) slowdownRootUpdate(ctx context.Context) bool {
+	var delay time.Duration
+	switch {
+	case ct.rootFailCount > 20:
+		delay = 10 * time.Second
+	case ct.rootFailCount > 5:
+		delay = 5 * time.Second
+	default:
+		return true
+	}
+	timeout := ct.c.clock.NewTimer(delay)
+	defer timeout.Stop()
+	select {
+	case <-timeout.C():
+		return true
+	case <-ctx.Done():
+		return false
+	}
 }
 
 // subtreeSync is the sync of an ENR or link subtree.

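As a quick reference for the retry pacing introduced above, the helper below reproduces the delay schedule of slowdownRootUpdate in isolation. This is a simplified illustration rather than the library code: the real implementation waits on the client's mclock timer inside a select so the delay can be cut short by context cancellation.

package main

import (
	"fmt"
	"time"
)

// rootRetryDelay mirrors the schedule in slowdownRootUpdate: the first few
// failed root lookups retry without delay, then retries are spaced 5 seconds
// apart, and after more than 20 consecutive failures 10 seconds apart.
func rootRetryDelay(rootFailCount int) time.Duration {
	switch {
	case rootFailCount > 20:
		return 10 * time.Second
	case rootFailCount > 5:
		return 5 * time.Second
	default:
		return 0
	}
}

func main() {
	for _, n := range []int{1, 6, 21} {
		fmt.Printf("after %d consecutive root failures: wait %v before the next attempt\n",
			n, rootRetryDelay(n))
	}
}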