Skip to content

Commit 804d45c

Browse files
0xVasconcelos and fjl authored
p2p: DNS resolution for static nodes (#30822)
Closes #23210

# Context

When deploying Geth in Kubernetes with ReplicaSets, we encountered two DNS-related issues affecting node connectivity.

First, during startup, Geth tries to resolve DNS names for static nodes too early, in the config unmarshaling phase. If peer nodes aren't ready yet (which is common in Kubernetes rolling deployments), this causes an immediate failure:

```
INFO [11-26|10:03:42.816] Starting Geth on Ethereum mainnet...
INFO [11-26|10:03:42.817] Bumping default cache on mainnet provided=1024 updated=4096
Fatal: config.toml, line 81: (p2p.Config.StaticNodes) lookup idontexist.geth.node: no such host
```

The second issue comes up when pods get rescheduled to different nodes: their IPs change, but peers keep using the initially resolved IP, never updating the DNS mapping.

This PR adds proper DNS support for enode:// URLs by deferring resolution to connection time. It also handles DNS failures gracefully instead of failing fatally during startup, making it work better in container environments where IPs are dynamic and peers come and go during rollouts.

---------

Co-authored-by: Felix Lange <[email protected]>
1 parent 88cbfab commit 804d45c

File tree

7 files changed

+228
-76
lines changed

7 files changed

+228
-76
lines changed

p2p/dial.go

Lines changed: 114 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,15 @@ import (
2424
"fmt"
2525
mrand "math/rand"
2626
"net"
27+
"net/netip"
2728
"sync"
2829
"sync/atomic"
2930
"time"
3031

3132
"github.com/ethereum/go-ethereum/common/mclock"
3233
"github.com/ethereum/go-ethereum/log"
3334
"github.com/ethereum/go-ethereum/p2p/enode"
35+
"github.com/ethereum/go-ethereum/p2p/enr"
3436
"github.com/ethereum/go-ethereum/p2p/netutil"
3537
)
3638

@@ -77,6 +79,7 @@ var (
7779
errRecentlyDialed = errors.New("recently dialed")
7880
errNetRestrict = errors.New("not contained in netrestrict list")
7981
errNoPort = errors.New("node does not provide TCP port")
82+
errNoResolvedIP = errors.New("node does not provide a resolved IP")
8083
)
8184

8285
// dialer creates outbound connections and submits them into Server.
@@ -90,16 +93,17 @@ var (
9093
// to create peer connections to nodes arriving through the iterator.
9194
type dialScheduler struct {
9295
dialConfig
93-
setupFunc dialSetupFunc
94-
wg sync.WaitGroup
95-
cancel context.CancelFunc
96-
ctx context.Context
97-
nodesIn chan *enode.Node
98-
doneCh chan *dialTask
99-
addStaticCh chan *enode.Node
100-
remStaticCh chan *enode.Node
101-
addPeerCh chan *conn
102-
remPeerCh chan *conn
96+
setupFunc dialSetupFunc
97+
dnsLookupFunc func(ctx context.Context, network string, name string) ([]netip.Addr, error)
98+
wg sync.WaitGroup
99+
cancel context.CancelFunc
100+
ctx context.Context
101+
nodesIn chan *enode.Node
102+
doneCh chan *dialTask
103+
addStaticCh chan *enode.Node
104+
remStaticCh chan *enode.Node
105+
addPeerCh chan *conn
106+
remPeerCh chan *conn
103107

104108
// Everything below here belongs to loop and
105109
// should only be accessed by code on the loop goroutine.
@@ -159,18 +163,19 @@ func (cfg dialConfig) withDefaults() dialConfig {
159163
func newDialScheduler(config dialConfig, it enode.Iterator, setupFunc dialSetupFunc) *dialScheduler {
160164
cfg := config.withDefaults()
161165
d := &dialScheduler{
162-
dialConfig: cfg,
163-
historyTimer: mclock.NewAlarm(cfg.clock),
164-
setupFunc: setupFunc,
165-
dialing: make(map[enode.ID]*dialTask),
166-
static: make(map[enode.ID]*dialTask),
167-
peers: make(map[enode.ID]struct{}),
168-
doneCh: make(chan *dialTask),
169-
nodesIn: make(chan *enode.Node),
170-
addStaticCh: make(chan *enode.Node),
171-
remStaticCh: make(chan *enode.Node),
172-
addPeerCh: make(chan *conn),
173-
remPeerCh: make(chan *conn),
166+
dialConfig: cfg,
167+
historyTimer: mclock.NewAlarm(cfg.clock),
168+
setupFunc: setupFunc,
169+
dnsLookupFunc: net.DefaultResolver.LookupNetIP,
170+
dialing: make(map[enode.ID]*dialTask),
171+
static: make(map[enode.ID]*dialTask),
172+
peers: make(map[enode.ID]struct{}),
173+
doneCh: make(chan *dialTask),
174+
nodesIn: make(chan *enode.Node),
175+
addStaticCh: make(chan *enode.Node),
176+
remStaticCh: make(chan *enode.Node),
177+
addPeerCh: make(chan *conn),
178+
remPeerCh: make(chan *conn),
174179
}
175180
d.lastStatsLog = d.clock.Now()
176181
d.ctx, d.cancel = context.WithCancel(context.Background())
@@ -274,7 +279,7 @@ loop:
274279
case node := <-d.addStaticCh:
275280
id := node.ID()
276281
_, exists := d.static[id]
277-
d.log.Trace("Adding static node", "id", id, "ip", node.IPAddr(), "added", !exists)
282+
d.log.Trace("Adding static node", "id", id, "endpoint", nodeEndpointForLog(node), "added", !exists)
278283
if exists {
279284
continue loop
280285
}
@@ -433,10 +438,68 @@ func (d *dialScheduler) removeFromStaticPool(idx int) {
433438
task.staticPoolIndex = -1
434439
}
435440

441+
// dnsResolveHostname updates the given node from its DNS hostname.
442+
// This is used to resolve static dial targets.
443+
func (d *dialScheduler) dnsResolveHostname(n *enode.Node) (*enode.Node, error) {
444+
if n.Hostname() == "" {
445+
return n, nil
446+
}
447+
448+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
449+
defer cancel()
450+
foundIPs, err := d.dnsLookupFunc(ctx, "ip", n.Hostname())
451+
if err != nil {
452+
return n, err
453+
}
454+
455+
// Check for IP updates.
456+
var (
457+
nodeIP4, nodeIP6 netip.Addr
458+
foundIP4, foundIP6 netip.Addr
459+
)
460+
n.Load((*enr.IPv4Addr)(&nodeIP4))
461+
n.Load((*enr.IPv6Addr)(&nodeIP6))
462+
for _, ip := range foundIPs {
463+
if ip.Is4() && !foundIP4.IsValid() {
464+
foundIP4 = ip
465+
}
466+
if ip.Is6() && !foundIP6.IsValid() {
467+
foundIP6 = ip
468+
}
469+
}
470+
471+
if !foundIP4.IsValid() && !foundIP6.IsValid() {
472+
// Lookup failed.
473+
return n, errNoResolvedIP
474+
}
475+
if foundIP4 == nodeIP4 && foundIP6 == nodeIP6 {
476+
// No updates necessary.
477+
d.log.Trace("Node DNS lookup had no update", "id", n.ID(), "name", n.Hostname(), "ip", foundIP4, "ip6", foundIP6)
478+
return n, nil
479+
}
480+
481+
// Update the node. Note this invalidates the ENR signature, because we use SignNull
482+
// to create a modified copy. But this should be OK, since we just use the node as a
483+
// dial target. And nodes will usually only have a DNS hostname if they came from a
484+
// enode:// URL, which has no signature anyway. If it ever becomes a problem, the
485+
// resolved IP could also be stored into dialTask instead of the node.
486+
rec := n.Record()
487+
if foundIP4.IsValid() {
488+
rec.Set(enr.IPv4Addr(foundIP4))
489+
}
490+
if foundIP6.IsValid() {
491+
rec.Set(enr.IPv6Addr(foundIP6))
492+
}
493+
rec.SetSeq(n.Seq()) // ensure seq not bumped by update
494+
newNode := enode.SignNull(rec, n.ID()).WithHostname(n.Hostname())
495+
d.log.Debug("Node updated from DNS lookup", "id", n.ID(), "name", n.Hostname(), "ip", newNode.IP())
496+
return newNode, nil
497+
}
498+
436499
// startDial runs the given dial task in a separate goroutine.
437500
func (d *dialScheduler) startDial(task *dialTask) {
438501
node := task.dest()
439-
d.log.Trace("Starting p2p dial", "id", node.ID(), "ip", node.IPAddr(), "flag", task.flags)
502+
d.log.Trace("Starting p2p dial", "id", node.ID(), "endpoint", nodeEndpointForLog(node), "flag", task.flags)
440503
hkey := string(node.ID().Bytes())
441504
d.history.add(hkey, d.clock.Now().Add(dialHistoryExpiration))
442505
d.dialing[node.ID()] = task
@@ -473,23 +536,38 @@ func (t *dialTask) dest() *enode.Node {
473536
}
474537

475538
func (t *dialTask) run(d *dialScheduler) {
476-
if t.needResolve() && !t.resolve(d) {
477-
return
539+
if t.isStatic() {
540+
// Resolve DNS.
541+
if n := t.dest(); n.Hostname() != "" {
542+
resolved, err := d.dnsResolveHostname(n)
543+
if err != nil {
544+
d.log.Warn("DNS lookup of static node failed", "id", n.ID(), "name", n.Hostname(), "err", err)
545+
} else {
546+
t.destPtr.Store(resolved)
547+
}
548+
}
549+
// Try resolving node ID through the DHT if there is no IP address.
550+
if !t.dest().IPAddr().IsValid() {
551+
if !t.resolve(d) {
552+
return // DHT resolve failed, skip dial.
553+
}
554+
}
478555
}
479556

480557
err := t.dial(d, t.dest())
481558
if err != nil {
482559
// For static nodes, resolve one more time if dialing fails.
483-
if _, ok := err.(*dialError); ok && t.flags&staticDialedConn != 0 {
560+
var dialErr *dialError
561+
if errors.As(err, &dialErr) && t.isStatic() {
484562
if t.resolve(d) {
485563
t.dial(d, t.dest())
486564
}
487565
}
488566
}
489567
}
490568

491-
func (t *dialTask) needResolve() bool {
492-
return t.flags&staticDialedConn != 0 && !t.dest().IPAddr().IsValid()
569+
func (t *dialTask) isStatic() bool {
570+
return t.flags&staticDialedConn != 0
493571
}
494572

495573
// resolve attempts to find the current endpoint for the destination
@@ -553,3 +631,10 @@ func cleanupDialErr(err error) error {
553631
}
554632
return err
555633
}
634+
635+
func nodeEndpointForLog(n *enode.Node) string {
636+
if n.Hostname() != "" {
637+
return n.Hostname()
638+
}
639+
return n.IPAddr().String()
640+
}

p2p/dial_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"math/rand"
2424
"net"
25+
"net/netip"
2526
"reflect"
2627
"sync"
2728
"testing"
@@ -394,6 +395,34 @@ func TestDialSchedResolve(t *testing.T) {
394395
})
395396
}
396397

398+
func TestDialSchedDNSHostname(t *testing.T) {
399+
t.Parallel()
400+
401+
config := dialConfig{
402+
maxActiveDials: 1,
403+
maxDialPeers: 1,
404+
}
405+
node := newNode(uintID(0x01), ":30303").WithHostname("node-hostname")
406+
resolved := newNode(uintID(0x01), "1.2.3.4:30303").WithHostname("node-hostname")
407+
runDialTest(t, config, []dialTestRound{
408+
{
409+
update: func(d *dialScheduler) {
410+
d.dnsLookupFunc = func(ctx context.Context, network string, name string) ([]netip.Addr, error) {
411+
if name != "node-hostname" {
412+
t.Error("wrong hostname in DNS lookup:", name)
413+
}
414+
result := []netip.Addr{netip.MustParseAddr("1.2.3.4")}
415+
return result, nil
416+
}
417+
d.addStatic(node)
418+
},
419+
wantNewDials: []*enode.Node{
420+
resolved,
421+
},
422+
},
423+
})
424+
}
425+
397426
// -------
398427
// Code below here is the framework for the tests above.
399428

p2p/enode/node.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ var errMissingPrefix = errors.New("missing 'enr:' prefix for base64-encoded reco
3737
type Node struct {
3838
r enr.Record
3939
id ID
40+
41+
// hostname tracks the DNS name of the node.
42+
hostname string
43+
4044
// endpoint information
4145
ip netip.Addr
4246
udp uint16
@@ -77,6 +81,8 @@ func newNodeWithID(r *enr.Record, id ID) *Node {
7781
n.setIP4(ip4)
7882
case valid6:
7983
n.setIP6(ip6)
84+
default:
85+
n.setIPv4Ports()
8086
}
8187
return n
8288
}
@@ -103,6 +109,10 @@ func localityScore(ip netip.Addr) int {
103109

104110
func (n *Node) setIP4(ip netip.Addr) {
105111
n.ip = ip
112+
n.setIPv4Ports()
113+
}
114+
115+
func (n *Node) setIPv4Ports() {
106116
n.Load((*enr.UDP)(&n.udp))
107117
n.Load((*enr.TCP)(&n.tcp))
108118
}
@@ -184,6 +194,18 @@ func (n *Node) TCP() int {
184194
return int(n.tcp)
185195
}
186196

197+
// WithHostname adds a DNS hostname to the node.
198+
func (n *Node) WithHostname(hostname string) *Node {
199+
cpy := *n
200+
cpy.hostname = hostname
201+
return &cpy
202+
}
203+
204+
// Hostname returns the DNS name assigned by WithHostname.
205+
func (n *Node) Hostname() string {
206+
return n.hostname
207+
}
208+
187209
// UDPEndpoint returns the announced UDP endpoint.
188210
func (n *Node) UDPEndpoint() (netip.AddrPort, bool) {
189211
if !n.ip.IsValid() || n.ip.IsUnspecified() || n.udp == 0 {

p2p/enode/node_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ func TestNodeEndpoints(t *testing.T) {
7474
wantUDP int
7575
wantTCP int
7676
wantQUIC int
77+
wantDNS string
7778
}
7879
tests := []endpointTest{
7980
{
@@ -90,6 +91,7 @@ func TestNodeEndpoints(t *testing.T) {
9091
r.Set(enr.UDP(9000))
9192
return SignNull(&r, id)
9293
}(),
94+
wantUDP: 9000,
9395
},
9496
{
9597
name: "tcp-only",
@@ -98,6 +100,7 @@ func TestNodeEndpoints(t *testing.T) {
98100
r.Set(enr.TCP(9000))
99101
return SignNull(&r, id)
100102
}(),
103+
wantTCP: 9000,
101104
},
102105
{
103106
name: "quic-only",
@@ -268,6 +271,19 @@ func TestNodeEndpoints(t *testing.T) {
268271
wantIP: netip.MustParseAddr("2001::ff00:0042:8329"),
269272
wantQUIC: 9001,
270273
},
274+
{
275+
name: "dns-only",
276+
node: func() *Node {
277+
var r enr.Record
278+
r.Set(enr.UDP(30303))
279+
r.Set(enr.TCP(30303))
280+
n := SignNull(&r, id).WithHostname("example.com")
281+
return n
282+
}(),
283+
wantTCP: 30303,
284+
wantUDP: 30303,
285+
wantDNS: "example.com",
286+
},
271287
}
272288

273289
for _, test := range tests {
@@ -284,6 +300,9 @@ func TestNodeEndpoints(t *testing.T) {
284300
if quic, _ := test.node.QUICEndpoint(); test.wantQUIC != int(quic.Port()) {
285301
t.Errorf("node has wrong QUIC port %d, want %d", quic.Port(), test.wantQUIC)
286302
}
303+
if test.wantDNS != test.node.Hostname() {
304+
t.Errorf("node has wrong DNS name %s, want %s", test.node.Hostname(), test.wantDNS)
305+
}
287306
})
288307
}
289308
}

0 commit comments

Comments
 (0)