Skip to content

Commit c2f7e6a

Browse files
committed
routing: fail stale HTLC attempts during startup
This commit adds a check during router's startup and fails the inflight HTLCs if they are routing using channels unknown to us. The channels are unknown because they are already closed, usually long time ago.
1 parent b998ce1 commit c2f7e6a

File tree

3 files changed

+154
-15
lines changed

3 files changed

+154
-15
lines changed

routing/router.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,13 @@ type Config struct {
285285
// ApplyChannelUpdate can be called to apply a new channel update to the
286286
// graph that we received from a payment failure.
287287
ApplyChannelUpdate func(msg *lnwire.ChannelUpdate) bool
288+
289+
// FetchClosedChannels is used by the router to fetch closed channels.
290+
//
291+
// TODO(yy): remove this method once the root cause of stuck payments
292+
// is found.
293+
FetchClosedChannels func(pendingOnly bool) (
294+
[]*channeldb.ChannelCloseSummary, error)
288295
}
289296

290297
// EdgeLocator is a struct used to identify a specific edge.
@@ -1384,6 +1391,19 @@ func (r *ChannelRouter) BuildRoute(amt fn.Option[lnwire.MilliSatoshi],
13841391
// resumePayments fetches inflight payments and resumes their payment
13851392
// lifecycles.
13861393
func (r *ChannelRouter) resumePayments() error {
1394+
// Get a list of closed channels.
1395+
channels, err := r.cfg.FetchClosedChannels(false)
1396+
if err != nil {
1397+
return err
1398+
}
1399+
1400+
closedSCIDs := make(map[lnwire.ShortChannelID]struct{}, len(channels))
1401+
for _, c := range channels {
1402+
if !c.IsPending {
1403+
closedSCIDs[c.ShortChanID] = struct{}{}
1404+
}
1405+
}
1406+
13871407
// Get all payments that are inflight.
13881408
payments, err := r.cfg.Control.FetchInFlightPayments()
13891409
if err != nil {
@@ -1399,6 +1419,12 @@ func (r *ChannelRouter) resumePayments() error {
13991419
for _, p := range payments {
14001420
for _, a := range p.HTLCs {
14011421
toKeep[a.AttemptID] = struct{}{}
1422+
1423+
// Try to fail the attempt if the route contains a dead
1424+
// channel.
1425+
r.failStaleAttempt(
1426+
a, p.Info.PaymentIdentifier, closedSCIDs,
1427+
)
14021428
}
14031429
}
14041430

@@ -1473,6 +1499,108 @@ func (r *ChannelRouter) resumePayments() error {
14731499
return nil
14741500
}
14751501

1502+
// failStaleAttempt will fail an HTLC attempt if it's using an unknown channel
1503+
// in its route. It first consults the switch to see if there's already a
1504+
// network result stored for this attempt. If not, it will check whether the
1505+
// first hop of this attempt is using an active channel known to us. If
1506+
// inactive, this attempt will be failed.
1507+
//
1508+
// NOTE: there's an unknown bug that caused the network result for a particular
1509+
// attempt to NOT be saved, resulting a payment being stuck forever. More info:
1510+
// - https://github.com/lightningnetwork/lnd/issues/8146
1511+
// - https://github.com/lightningnetwork/lnd/pull/8174
1512+
func (r *ChannelRouter) failStaleAttempt(a channeldb.HTLCAttempt,
1513+
payHash lntypes.Hash, closedSCIDs map[lnwire.ShortChannelID]struct{}) {
1514+
1515+
// We can only fail inflight HTLCs so we skip the settled/failed ones.
1516+
if a.Failure != nil || a.Settle != nil {
1517+
return
1518+
}
1519+
1520+
// First, check if we've already had a network result for this attempt.
1521+
// If no result is found, we'll check whether the reference link is
1522+
// still known to us.
1523+
ok, err := r.cfg.Payer.HasAttemptResult(a.AttemptID)
1524+
if err != nil {
1525+
log.Errorf("Failed to fetch network result for attempt=%v",
1526+
a.AttemptID)
1527+
return
1528+
}
1529+
1530+
// There's already a network result, no need to fail it here as the
1531+
// payment lifecycle will take care of it, so we can exit early.
1532+
if ok {
1533+
log.Debugf("Already have network result for attempt=%v",
1534+
a.AttemptID)
1535+
return
1536+
}
1537+
1538+
// We now need to decide whether this attempt should be failed here.
1539+
// For very old payments, it's possible that the network results were
1540+
// never saved, causing the payments to be stuck inflight. We now check
1541+
// whether the first hop is referencing an active channel ID and, if
1542+
// not, we will fail the attempt as it has no way to be retried again.
1543+
var shouldFail bool
1544+
1545+
// Validate that the attempt has hop info. If this attempt has no hop
1546+
// info it indicates an error in our db.
1547+
if len(a.Route.Hops) == 0 {
1548+
log.Errorf("Found empty hop for attempt=%v", a.AttemptID)
1549+
1550+
shouldFail = true
1551+
} else {
1552+
// Get the short channel ID.
1553+
chanID := a.Route.Hops[0].ChannelID
1554+
scid := lnwire.NewShortChanIDFromInt(chanID)
1555+
1556+
// Check whether this link is active. If so, we won't fail the
1557+
// attempt but keep waiting for its result.
1558+
_, err := r.cfg.GetLink(scid)
1559+
if err == nil {
1560+
return
1561+
}
1562+
1563+
// We should get the link not found err. If not, we will log an
1564+
// error and skip failing this attempt since an unknown error
1565+
// occurred.
1566+
if !errors.Is(err, htlcswitch.ErrChannelLinkNotFound) {
1567+
log.Errorf("Failed to get link for attempt=%v for "+
1568+
"payment=%v: %v", a.AttemptID, payHash, err)
1569+
1570+
return
1571+
}
1572+
1573+
// The channel link is not active, we now check whether this
1574+
// channel is already closed. If so, we fail it as there's no
1575+
// need to wait for the network result because it won't be
1576+
// re-sent. If the channel is still pending, we'll keep waiting
1577+
// for the result as we may get a contract resolution for this
1578+
// HTLC.
1579+
if _, ok := closedSCIDs[scid]; ok {
1580+
shouldFail = true
1581+
}
1582+
}
1583+
1584+
// Exit if there's no need to fail.
1585+
if !shouldFail {
1586+
return
1587+
}
1588+
1589+
log.Errorf("Failing stale attempt=%v for payment=%v", a.AttemptID,
1590+
payHash)
1591+
1592+
// Fail the attempt in db. If there's an error, there's nothing we can
1593+
// do here but logging it.
1594+
failInfo := &channeldb.HTLCFailInfo{
1595+
Reason: channeldb.HTLCFailUnknown,
1596+
FailTime: r.cfg.Clock.Now(),
1597+
}
1598+
_, err = r.cfg.Control.FailAttempt(payHash, a.AttemptID, failInfo)
1599+
if err != nil {
1600+
log.Errorf("Fail attempt=%v got error: %v", a.AttemptID, err)
1601+
}
1602+
}
1603+
14761604
// getEdgeUnifiers returns a list of edge unifiers for the given route.
14771605
func getEdgeUnifiers(source route.Vertex, hops []route.Vertex,
14781606
outgoingChans map[uint64]struct{},

routing/router_test.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ func (c *testCtx) getChannelIDFromAlias(t *testing.T, a, b string) uint64 {
9393
return channelID
9494
}
9595

96+
func mockFetchClosedChannels(_ bool) ([]*channeldb.ChannelCloseSummary, error) {
97+
return nil, nil
98+
}
99+
96100
func createTestCtxFromGraphInstance(t *testing.T, startingHeight uint32,
97101
graphInstance *testGraphInstance) *testCtx {
98102

@@ -158,9 +162,10 @@ func createTestCtxFromGraphInstanceAssumeValid(t *testing.T,
158162
next := atomic.AddUint64(&uniquePaymentID, 1)
159163
return next, nil
160164
},
161-
PathFindingConfig: pathFindingConfig,
162-
Clock: clock.NewTestClock(time.Unix(1, 0)),
163-
ApplyChannelUpdate: graphBuilder.ApplyChannelUpdate,
165+
PathFindingConfig: pathFindingConfig,
166+
Clock: clock.NewTestClock(time.Unix(1, 0)),
167+
ApplyChannelUpdate: graphBuilder.ApplyChannelUpdate,
168+
FetchClosedChannels: mockFetchClosedChannels,
164169
})
165170
require.NoError(t, router.Start(), "unable to start router")
166171

@@ -2170,6 +2175,7 @@ func TestSendToRouteSkipTempErrSuccess(t *testing.T) {
21702175
NextPaymentID: func() (uint64, error) {
21712176
return 0, nil
21722177
},
2178+
FetchClosedChannels: mockFetchClosedChannels,
21732179
}}
21742180

21752181
// Register mockers with the expected method calls.
@@ -2253,6 +2259,7 @@ func TestSendToRouteSkipTempErrNonMPP(t *testing.T) {
22532259
NextPaymentID: func() (uint64, error) {
22542260
return 0, nil
22552261
},
2262+
FetchClosedChannels: mockFetchClosedChannels,
22562263
}}
22572264

22582265
// Expect an error to be returned.
@@ -2307,6 +2314,7 @@ func TestSendToRouteSkipTempErrTempFailure(t *testing.T) {
23072314
NextPaymentID: func() (uint64, error) {
23082315
return 0, nil
23092316
},
2317+
FetchClosedChannels: mockFetchClosedChannels,
23102318
}}
23112319

23122320
// Create the error to be returned.
@@ -2389,6 +2397,7 @@ func TestSendToRouteSkipTempErrPermanentFailure(t *testing.T) {
23892397
NextPaymentID: func() (uint64, error) {
23902398
return 0, nil
23912399
},
2400+
FetchClosedChannels: mockFetchClosedChannels,
23922401
}}
23932402

23942403
// Create the error to be returned.
@@ -2475,6 +2484,7 @@ func TestSendToRouteTempFailure(t *testing.T) {
24752484
NextPaymentID: func() (uint64, error) {
24762485
return 0, nil
24772486
},
2487+
FetchClosedChannels: mockFetchClosedChannels,
24782488
}}
24792489

24802490
// Create the error to be returned.

server.go

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -993,18 +993,19 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
993993
}
994994

995995
s.chanRouter, err = routing.New(routing.Config{
996-
SelfNode: selfNode.PubKeyBytes,
997-
RoutingGraph: graphsession.NewRoutingGraph(chanGraph),
998-
Chain: cc.ChainIO,
999-
Payer: s.htlcSwitch,
1000-
Control: s.controlTower,
1001-
MissionControl: s.missionControl,
1002-
SessionSource: paymentSessionSource,
1003-
GetLink: s.htlcSwitch.GetLinkByShortID,
1004-
NextPaymentID: sequencer.NextID,
1005-
PathFindingConfig: pathFindingConfig,
1006-
Clock: clock.NewDefaultClock(),
1007-
ApplyChannelUpdate: s.graphBuilder.ApplyChannelUpdate,
996+
SelfNode: selfNode.PubKeyBytes,
997+
RoutingGraph: graphsession.NewRoutingGraph(chanGraph),
998+
Chain: cc.ChainIO,
999+
Payer: s.htlcSwitch,
1000+
Control: s.controlTower,
1001+
MissionControl: s.missionControl,
1002+
SessionSource: paymentSessionSource,
1003+
GetLink: s.htlcSwitch.GetLinkByShortID,
1004+
NextPaymentID: sequencer.NextID,
1005+
PathFindingConfig: pathFindingConfig,
1006+
Clock: clock.NewDefaultClock(),
1007+
ApplyChannelUpdate: s.graphBuilder.ApplyChannelUpdate,
1008+
FetchClosedChannels: s.chanStateDB.FetchClosedChannels,
10081009
})
10091010
if err != nil {
10101011
return nil, fmt.Errorf("can't create router: %w", err)

0 commit comments

Comments
 (0)