@@ -285,6 +285,13 @@ type Config struct {
285285 // ApplyChannelUpdate can be called to apply a new channel update to the
286286 // graph that we received from a payment failure.
287287 ApplyChannelUpdate func (msg * lnwire.ChannelUpdate ) bool
288+
289+ // FetchClosedChannels is used by the router to fetch closed channels.
290+ //
291+ // TODO(yy): remove this method once the root cause of stuck payments
292+ // is found.
293+ FetchClosedChannels func (pendingOnly bool ) (
294+ []* channeldb.ChannelCloseSummary , error )
288295}
289296
290297// EdgeLocator is a struct used to identify a specific edge.
@@ -1384,6 +1391,19 @@ func (r *ChannelRouter) BuildRoute(amt fn.Option[lnwire.MilliSatoshi],
13841391// resumePayments fetches inflight payments and resumes their payment
13851392// lifecycles.
13861393func (r * ChannelRouter ) resumePayments () error {
1394+ // Get a list of closed channels.
1395+ channels , err := r .cfg .FetchClosedChannels (false )
1396+ if err != nil {
1397+ return err
1398+ }
1399+
1400+ closedSCIDs := make (map [lnwire.ShortChannelID ]struct {}, len (channels ))
1401+ for _ , c := range channels {
1402+ if ! c .IsPending {
1403+ closedSCIDs [c .ShortChanID ] = struct {}{}
1404+ }
1405+ }
1406+
13871407 // Get all payments that are inflight.
13881408 payments , err := r .cfg .Control .FetchInFlightPayments ()
13891409 if err != nil {
@@ -1399,6 +1419,12 @@ func (r *ChannelRouter) resumePayments() error {
13991419 for _ , p := range payments {
14001420 for _ , a := range p .HTLCs {
14011421 toKeep [a .AttemptID ] = struct {}{}
1422+
1423+ // Try to fail the attempt if the route contains a dead
1424+ // channel.
1425+ r .failStaleAttempt (
1426+ a , p .Info .PaymentIdentifier , closedSCIDs ,
1427+ )
14021428 }
14031429 }
14041430
@@ -1473,6 +1499,108 @@ func (r *ChannelRouter) resumePayments() error {
14731499 return nil
14741500}
14751501
1502+ // failStaleAttempt will fail an HTLC attempt if it's using an unknown channel
1503+ // in its route. It first consults the switch to see if there's already a
1504+ // network result stored for this attempt. If not, it will check whether the
1505+ // first hop of this attempt is using an active channel known to us. If
1506+ // inactive, this attempt will be failed.
1507+ //
1508+ // NOTE: there's an unknown bug that caused the network result for a particular
1509+ // attempt to NOT be saved, resulting a payment being stuck forever. More info:
1510+ // - https://github.com/lightningnetwork/lnd/issues/8146
1511+ // - https://github.com/lightningnetwork/lnd/pull/8174
1512+ func (r * ChannelRouter ) failStaleAttempt (a channeldb.HTLCAttempt ,
1513+ payHash lntypes.Hash , closedSCIDs map [lnwire.ShortChannelID ]struct {}) {
1514+
1515+ // We can only fail inflight HTLCs so we skip the settled/failed ones.
1516+ if a .Failure != nil || a .Settle != nil {
1517+ return
1518+ }
1519+
1520+ // First, check if we've already had a network result for this attempt.
1521+ // If no result is found, we'll check whether the reference link is
1522+ // still known to us.
1523+ ok , err := r .cfg .Payer .HasAttemptResult (a .AttemptID )
1524+ if err != nil {
1525+ log .Errorf ("Failed to fetch network result for attempt=%v" ,
1526+ a .AttemptID )
1527+ return
1528+ }
1529+
1530+ // There's already a network result, no need to fail it here as the
1531+ // payment lifecycle will take care of it, so we can exit early.
1532+ if ok {
1533+ log .Debugf ("Already have network result for attempt=%v" ,
1534+ a .AttemptID )
1535+ return
1536+ }
1537+
1538+ // We now need to decide whether this attempt should be failed here.
1539+ // For very old payments, it's possible that the network results were
1540+ // never saved, causing the payments to be stuck inflight. We now check
1541+ // whether the first hop is referencing an active channel ID and, if
1542+ // not, we will fail the attempt as it has no way to be retried again.
1543+ var shouldFail bool
1544+
1545+ // Validate that the attempt has hop info. If this attempt has no hop
1546+ // info it indicates an error in our db.
1547+ if len (a .Route .Hops ) == 0 {
1548+ log .Errorf ("Found empty hop for attempt=%v" , a .AttemptID )
1549+
1550+ shouldFail = true
1551+ } else {
1552+ // Get the short channel ID.
1553+ chanID := a .Route .Hops [0 ].ChannelID
1554+ scid := lnwire .NewShortChanIDFromInt (chanID )
1555+
1556+ // Check whether this link is active. If so, we won't fail the
1557+ // attempt but keep waiting for its result.
1558+ _ , err := r .cfg .GetLink (scid )
1559+ if err == nil {
1560+ return
1561+ }
1562+
1563+ // We should get the link not found err. If not, we will log an
1564+ // error and skip failing this attempt since an unknown error
1565+ // occurred.
1566+ if ! errors .Is (err , htlcswitch .ErrChannelLinkNotFound ) {
1567+ log .Errorf ("Failed to get link for attempt=%v for " +
1568+ "payment=%v: %v" , a .AttemptID , payHash , err )
1569+
1570+ return
1571+ }
1572+
1573+ // The channel link is not active, we now check whether this
1574+ // channel is already closed. If so, we fail it as there's no
1575+ // need to wait for the network result because it won't be
1576+ // re-sent. If the channel is still pending, we'll keep waiting
1577+ // for the result as we may get a contract resolution for this
1578+ // HTLC.
1579+ if _ , ok := closedSCIDs [scid ]; ok {
1580+ shouldFail = true
1581+ }
1582+ }
1583+
1584+ // Exit if there's no need to fail.
1585+ if ! shouldFail {
1586+ return
1587+ }
1588+
1589+ log .Errorf ("Failing stale attempt=%v for payment=%v" , a .AttemptID ,
1590+ payHash )
1591+
1592+ // Fail the attempt in db. If there's an error, there's nothing we can
1593+ // do here but logging it.
1594+ failInfo := & channeldb.HTLCFailInfo {
1595+ Reason : channeldb .HTLCFailUnknown ,
1596+ FailTime : r .cfg .Clock .Now (),
1597+ }
1598+ _ , err = r .cfg .Control .FailAttempt (payHash , a .AttemptID , failInfo )
1599+ if err != nil {
1600+ log .Errorf ("Fail attempt=%v got error: %v" , a .AttemptID , err )
1601+ }
1602+ }
1603+
14761604// getEdgeUnifiers returns a list of edge unifiers for the given route.
14771605func getEdgeUnifiers (source route.Vertex , hops []route.Vertex ,
14781606 outgoingChans map [uint64 ]struct {},
0 commit comments