55 "errors"
66 "fmt"
77 "net/netip"
8+ "strings"
89 "sync"
910 "sync/atomic"
1011 "time"
@@ -26,6 +27,21 @@ const MappingDuration = time.Minute
2627// CacheTime is the time a mapping will cache an external address for
2728const CacheTime = 15 * time .Second
2829
30+ // DiscoveryTimeout is the maximum time to wait for NAT discovery.
31+ // This is based on the underlying UPnP and NAT-PMP/PCP protocols:
32+ // - SSDP (UPnP discovery) waits 5 seconds for responses
33+ // - NAT-PMP uses exponential backoff starting at 250ms, up to 9 retries
34+ // (total ~32 seconds if exhausted, but typically responds in 1-2 seconds)
35+ // - PCP follows similar timing to NAT-PMP
36+ // - 10 seconds covers common cases while failing fast when no NAT exists
37+ const DiscoveryTimeout = 10 * time .Second
38+
39+ // rediscoveryThreshold is the number of consecutive connection failures
40+ // before triggering NAT rediscovery. We ignore first few failures to
41+ // distinguish between transient network issues and persistent router
42+ // problems like restarts or port changes that require finding the NAT device again.
43+ const rediscoveryThreshold = 3
44+
2945type entry struct {
3046 protocol string
3147 port int
@@ -40,18 +56,15 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
4056 if err != nil {
4157 return nil , err
4258 }
43- var extAddr netip.Addr
44- extIP , err := natInstance .GetExternalAddress ()
45- if err == nil {
46- extAddr , _ = netip .AddrFromSlice (extIP )
47- }
59+
60+ extAddr := getExternalAddress (natInstance )
4861
4962 // Log the device addr.
5063 addr , err := natInstance .GetDeviceAddress ()
5164 if err != nil {
52- log .Debug ("DiscoverGateway address error" , "err" , err )
65+ log .Warn ("DiscoverGateway address error" , "err" , err )
5366 } else {
54- log .Debug ("DiscoverGateway address" , "address" , addr )
67+ log .Info ("DiscoverGateway address" , "address" , addr )
5568 }
5669
5770 ctx , cancel := context .WithCancel (context .Background ())
@@ -74,19 +87,35 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
7487// NATs (Network Address Translators). It is a long-running
7588// service that will periodically renew port mappings,
7689// and keep an up-to-date list of all the external addresses.
90+ //
91+ // Locking strategy:
92+ // - natmu: Protects nat instance and rediscovery state (nat, consecutiveFailures, rediscovering)
93+ // - mappingmu: Protects port mappings table and closed flag (mappings, closed)
94+ // - Lock ordering: When both locks are needed, always acquire mappingmu before natmu
95+ // to prevent deadlocks
96+ // - We use separate mutexes because the NAT instance may change (e.g., when router
97+ // restarts and UPnP port changes), but the port mappings must persist and be
98+ // re-applied across all instances. This separation allows the mappings table to
99+ // remain stable while the underlying NAT device changes.
77100type NAT struct {
78101 natmu sync.Mutex
79102 nat nat.NAT
103+
104+ // Track connection failures for auto-rediscovery
105+ consecutiveFailures int // protected by natmu
106+ rediscovering bool // protected by natmu
107+
80108 // External IP of the NAT. Will be renewed periodically (every CacheTime).
81109 extAddr atomic.Pointer [netip.Addr ]
82110
83111 refCount sync.WaitGroup
84112 ctx context.Context
85113 ctxCancel context.CancelFunc
86114
87- mappingmu sync.RWMutex // guards mappings
88- closed bool
89- mappings map [entry ]int
115+ // Port mappings that should persist across NAT instance changes
116+ mappingmu sync.RWMutex
117+ closed bool // protected by mappingmu
118+ mappings map [entry ]int // protected by mappingmu
90119}
91120
92121// Close shuts down all port mappings. NAT can no longer be used.
@@ -149,11 +178,14 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
149178func (nat * NAT ) RemoveMapping (ctx context.Context , protocol string , port int ) error {
150179 nat .mappingmu .Lock ()
151180 defer nat .mappingmu .Unlock ()
181+ nat .natmu .Lock ()
182+ defer nat .natmu .Unlock ()
152183
153184 switch protocol {
154185 case "tcp" , "udp" :
155186 e := entry {protocol : protocol , port : port }
156187 if _ , ok := nat .mappings [e ]; ok {
188+ log .Info ("Stopping maintenance of port mapping" , "protocol" , protocol , "port" , port )
157189 delete (nat .mappings , e )
158190 return nat .nat .DeletePortMapping (ctx , protocol , port )
159191 }
@@ -164,6 +196,12 @@ func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) er
164196}
165197
166198func (nat * NAT ) background () {
199+ // Renew port mappings every 20 seconds (1/3 of 60s lifetime).
200+ // - NAT-PMP RFC 6886 recommends renewing at 50% of lifetime
201+ // - We use 33% for added safety against silent lifetime reductions
202+ // NOTE: This aggressive 60s/20s pattern may be outdated for modern routers
203+ // but provides quick cleanup and fast failure detection for our rediscovery.
204+ // TODO: Research longer durations (e.g. 30min/10min) to reduce router load
167205 const mappingUpdate = MappingDuration / 3
168206
169207 now := time .Now ()
@@ -202,8 +240,11 @@ func (nat *NAT) background() {
202240 nextMappingUpdate = time .Now ().Add (mappingUpdate )
203241 }
204242 if now .After (nextAddrUpdate ) {
205- var extAddr netip. Addr
243+ nat . natmu . Lock ()
206244 extIP , err := nat .nat .GetExternalAddress ()
245+ nat .natmu .Unlock ()
246+
247+ var extAddr netip.Addr
207248 if err == nil {
208249 extAddr , _ = netip .AddrFromSlice (extIP )
209250 }
@@ -213,13 +254,16 @@ func (nat *NAT) background() {
213254 t .Reset (time .Until (minTime (nextAddrUpdate , nextMappingUpdate )))
214255 case <- nat .ctx .Done ():
215256 nat .mappingmu .Lock ()
257+ defer nat .mappingmu .Unlock ()
258+ nat .natmu .Lock ()
259+ defer nat .natmu .Unlock ()
260+
216261 ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
217262 defer cancel ()
218263 for e := range nat .mappings {
219- delete (nat .mappings , e )
220264 nat .nat .DeletePortMapping (ctx , e .protocol , e .port )
221265 }
222- nat .mappingmu . Unlock ( )
266+ clear ( nat .mappings )
223267 return
224268 }
225269 }
@@ -229,28 +273,95 @@ func (nat *NAT) establishMapping(ctx context.Context, protocol string, internalP
229273 log .Debug ("Attempting port map" , "protocol" , protocol , "internal_port" , internalPort )
230274 const comment = "libp2p"
231275
276+ // Try to establish the mapping with both NAT calls under the same lock
232277 nat .natmu .Lock ()
278+ defer nat .natmu .Unlock ()
279+
233280 var err error
234281 externalPort , err = nat .nat .AddPortMapping (ctx , protocol , internalPort , comment , MappingDuration )
235282 if err != nil {
236283 // Some hardware does not support mappings with timeout, so try that
237284 externalPort , err = nat .nat .AddPortMapping (ctx , protocol , internalPort , comment , 0 )
238285 }
239- nat .natmu .Unlock ()
240286
241- if err != nil || externalPort == 0 {
242- if err != nil {
243- log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "err" , err )
287+ // Handle success
288+ if err == nil && externalPort != 0 {
289+ nat .consecutiveFailures = 0
290+ log .Debug ("NAT port mapping established" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , externalPort )
291+ return externalPort
292+ }
293+
294+ // Handle failures
295+ if err != nil {
296+ log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "err" , err )
297+
298+ // Check if this is a connection error that might indicate router restart
299+ // See: https://github.com/libp2p/go-libp2p/issues/3224#issuecomment-2866844723
300+ // Note: We use string matching because goupnp doesn't preserve error chains (uses %v instead of %w)
301+ if strings .Contains (err .Error (), "connection refused" ) {
302+ nat .consecutiveFailures ++
303+ if nat .consecutiveFailures >= rediscoveryThreshold && ! nat .rediscovering {
304+ nat .rediscovering = true
305+ // Spawn in goroutine to avoid blocking the caller while we
306+ // perform network discovery, which can take up to 30 seconds.
307+ // The rediscovering flag prevents multiple concurrent attempts.
308+ go nat .rediscoverNAT ()
309+ }
244310 } else {
245- log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , 0 )
311+ // Reset counter for non-connection errors (transient failures)
312+ nat .consecutiveFailures = 0
246313 }
247- // we do not close if the mapping failed,
248- // because it may work again next time.
249314 return 0
250315 }
251316
252- log .Debug ("NAT Mapping" , "external_port" , externalPort , "internal_port" , internalPort , "protocol" , protocol )
253- return externalPort
317+ // externalPort is 0 but no error was returned
318+ log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , 0 )
319+ return 0
320+ }
321+
322+ // rediscoverNAT attempts to rediscover the NAT device after connection failures
323+ func (nat * NAT ) rediscoverNAT () {
324+ log .Info ("NAT rediscovery triggered due to repeated connection failures" )
325+
326+ ctx , cancel := context .WithTimeout (nat .ctx , DiscoveryTimeout )
327+ defer cancel ()
328+
329+ newNATInstance , err := discoverGateway (ctx )
330+ if err != nil {
331+ log .Warn ("NAT rediscovery failed" , "err" , err )
332+ nat .natmu .Lock ()
333+ defer nat .natmu .Unlock ()
334+ nat .rediscovering = false
335+ return
336+ }
337+
338+ extAddr := getExternalAddress (newNATInstance )
339+
340+ // Replace the NAT instance
341+ // No cleanup of the old instance needed because:
342+ // - Router restart has already wiped all mappings
343+ // - Old UPnP endpoint is dead (connection refused)
344+ // - If router didn't actually restart (false positive), any stale mappings
345+ // on the router expire naturally (60 second UPnP timeout)
346+ nat .natmu .Lock ()
347+ nat .nat = newNATInstance
348+ nat .extAddr .Store (& extAddr )
349+ nat .consecutiveFailures = 0
350+ nat .rediscovering = false
351+ nat .natmu .Unlock ()
352+
353+ // Re-establish all existing mappings on the new NAT instance
354+ nat .mappingmu .Lock ()
355+ for e := range nat .mappings {
356+ extPort := nat .establishMapping (nat .ctx , e .protocol , e .port )
357+ nat .mappings [e ] = extPort
358+ if extPort != 0 {
359+ log .Info ("NAT mapping restored after rediscovery" , "protocol" , e .protocol , "internal_port" , e .port , "external_port" , extPort )
360+ }
361+ }
362+ nat .mappingmu .Unlock ()
363+
364+ log .Info ("NAT rediscovery successful" )
254365}
255366
256367func minTime (a , b time.Time ) time.Time {
@@ -259,3 +370,18 @@ func minTime(a, b time.Time) time.Time {
259370 }
260371 return b
261372}
373+
374+ // getExternalAddress retrieves and parses the external address from a NAT instance
375+ func getExternalAddress (natInstance nat.NAT ) netip.Addr {
376+ extIP , err := natInstance .GetExternalAddress ()
377+ if err != nil {
378+ log .Debug ("Failed to get external address" , "err" , err )
379+ return netip.Addr {}
380+ }
381+ extAddr , ok := netip .AddrFromSlice (extIP )
382+ if ! ok {
383+ log .Debug ("Failed to parse external address" , "ip" , extIP )
384+ return netip.Addr {}
385+ }
386+ return extAddr
387+ }
0 commit comments