Commit 658c408
Re-fail perm-failed HTLCs on startup in case of MonitorEvent loss
`MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets persisted (e.g. due to a block update), then the node crashes prior to persisting the `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be lost. This isn't likely in a sync persist environment, but in an async one it could be an issue.

Note that this is only an issue for closed channels - `MonitorEvent`s only inform the `ChannelManager` that a channel was closed (which the `ChannelManager` will learn on startup or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes completed (which the `ChannelManager` will detect on startup), or that HTLCs were resolved on-chain post-closure. Of the three, only the last is problematic to lose prior to a reload.

In a previous commit we handled the case of claimed HTLCs by replaying payment preimages on startup, so that `MonitorEvent` loss cannot cause us to miss an HTLC claim. Here we handle the HTLC-failed case similarly. Unlike with HTLC claims via preimage, we don't already have replay logic in `ChannelManager` startup, but it's easy enough to add one. Luckily, we already track when an HTLC reaches the permanently-failed state in `ChannelMonitor` (i.e. the failing transaction has `ANTI_REORG_DELAY` confirmations on-chain), so all we need to do is add the ability to query for that and fail such HTLCs on `ChannelManager` startup.
1 parent e7f13f6 commit 658c408
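In isolation, the shape of the fix is: during deserialization, ask each `ChannelMonitor` which outbound HTLCs it has already seen fail irrevocably on-chain, and push them through the normal fail-backwards path as if the lost `MonitorEvent` had been delivered. The sketch below is a minimal, self-contained illustration of that replay pattern; `MonitorSnapshot`, `HtlcSource`, `HtlcInfo`, and `replay_onchain_failures` are simplified stand-ins invented for this example, not the real LDK types (the actual accessor, `get_onchain_failed_outbound_htlcs`, is in the diff below).

```rust
use std::collections::HashMap;

// Simplified stand-ins for the real LDK types (illustration only).
#[derive(Clone, Hash, PartialEq, Eq, Debug)]
struct HtlcSource(u64);

#[derive(Clone, Debug)]
struct HtlcInfo {
    payment_hash: [u8; 32],
    // `None` means the HTLC resolved on-chain without us ever learning a
    // preimage, i.e. it failed via the timeout path.
    payment_preimage: Option<[u8; 32]>,
}

// A toy stand-in for a `ChannelMonitor` that kept watching the chain while the
// corresponding `MonitorEvent`s never made it into a persisted `ChannelManager`.
struct MonitorSnapshot {
    resolved_htlcs: HashMap<HtlcSource, HtlcInfo>,
}

impl MonitorSnapshot {
    // Mirrors the idea of the new accessor: return only the HTLCs that resolved
    // on-chain without a preimage, i.e. the permanent failures.
    fn onchain_failed_outbound_htlcs(&self) -> impl Iterator<Item = (&HtlcSource, &HtlcInfo)> {
        self.resolved_htlcs.iter().filter(|(_, htlc)| htlc.payment_preimage.is_none())
    }
}

// On startup, replay the permanent failures so the fail-backwards path (and the
// resulting payment-failure events) isn't lost along with the `MonitorEvent`s.
fn replay_onchain_failures(monitors: &[MonitorSnapshot]) -> Vec<(HtlcSource, [u8; 32])> {
    let mut failed_htlcs = Vec::new();
    for monitor in monitors {
        for (source, htlc) in monitor.onchain_failed_outbound_htlcs() {
            failed_htlcs.push((source.clone(), htlc.payment_hash));
        }
    }
    failed_htlcs
}

fn main() {
    let monitor = MonitorSnapshot {
        resolved_htlcs: HashMap::from([
            (HtlcSource(1), HtlcInfo { payment_hash: [1; 32], payment_preimage: None }),
            (HtlcSource(2), HtlcInfo { payment_hash: [2; 32], payment_preimage: Some([9; 32]) }),
        ]),
    };
    // Only the timed-out HTLC is replayed; the claimed one is left alone.
    let replayed = replay_onchain_failures(&[monitor]);
    assert_eq!(replayed.len(), 1);
    assert_eq!(replayed[0].1, [1; 32]);
}
```

In the commit itself, the equivalent of the filter lives in `ChannelMonitor::get_onchain_failed_outbound_htlcs` (keyed off `htlcs_resolved_on_chain`), and the replay loop runs during `ChannelManager` deserialization, as the diffs below show.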

3 files changed: 268 additions, 9 deletions


lightning/src/chain/channelmonitor.rs

Lines changed: 69 additions & 0 deletions
@@ -2934,6 +2934,75 @@ impl<Signer: EcdsaChannelSigner> ChannelMonitor<Signer> {
        res
    }

+    /// Gets the set of outbound HTLCs which hit the chain and ultimately were claimed by us via
+    /// the timeout path and reached [`ANTI_REORG_DELAY`] confirmations. This is used to determine
+    /// if an HTLC has failed without the `ChannelManager` having seen it prior to being persisted.
+    pub(crate) fn get_onchain_failed_outbound_htlcs(
+        &self,
+    ) -> HashMap<HTLCSource, HTLCOutputInCommitment> {
+        let us = self.inner.lock().unwrap();
+        // We're only concerned with the confirmation count of HTLC transactions, and don't
+        // actually care how many confirmations a commitment transaction may or may not have. Thus,
+        // we look for either a FundingSpendConfirmation event or a funding_spend_confirmed.
+        let confirmed_txid = us.funding_spend_confirmed.or_else(|| {
+            us.onchain_events_awaiting_threshold_conf.iter().find_map(|event| {
+                if let OnchainEvent::FundingSpendConfirmation { .. } = event.event {
+                    Some(event.txid)
+                } else {
+                    None
+                }
+            })
+        });
+
+        if confirmed_txid.is_none() {
+            return new_hash_map();
+        }
+
+        let mut res = new_hash_map();
+        macro_rules! walk_htlcs {
+            ($holder_commitment: expr, $htlc_iter: expr) => {
+                for (htlc, source) in $htlc_iter {
+                    let filter = |v: &&IrrevocablyResolvedHTLC| {
+                        v.commitment_tx_output_idx == htlc.transaction_output_index
+                    };
+                    if let Some(state) = us.htlcs_resolved_on_chain.iter().filter(filter).next() {
+                        if let Some(source) = source {
+                            if state.payment_preimage.is_none() {
+                                res.insert(source.clone(), htlc.clone());
+                            }
+                        }
+                    }
+                }
+            };
+        }
+
+        let txid = confirmed_txid.unwrap();
+        if Some(txid) == us.funding.current_counterparty_commitment_txid
+            || Some(txid) == us.funding.prev_counterparty_commitment_txid
+        {
+            walk_htlcs!(
+                false,
+                us.funding.counterparty_claimable_outpoints.get(&txid).unwrap().iter().filter_map(
+                    |(a, b)| {
+                        if let &Some(ref source) = b {
+                            Some((a, Some(&**source)))
+                        } else {
+                            None
+                        }
+                    }
+                )
+            );
+        } else if txid == us.funding.current_holder_commitment_tx.trust().txid() {
+            walk_htlcs!(true, holder_commitment_htlcs!(us, CURRENT_WITH_SOURCES));
+        } else if let Some(prev_commitment_tx) = &us.funding.prev_holder_commitment_tx {
+            if txid == prev_commitment_tx.trust().txid() {
+                walk_htlcs!(true, holder_commitment_htlcs!(us, PREV_WITH_SOURCES).unwrap());
+            }
+        }
+
+        res
+    }
+
    /// Gets the set of outbound HTLCs which are pending resolution in this channel or which were
    /// resolved with a preimage from our counterparty.
    ///
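Worth noting about this accessor: it relies only on state the `ChannelMonitor` already tracks and persists. An HTLC ends up in `htlcs_resolved_on_chain` once the transaction resolving it has `ANTI_REORG_DELAY` confirmations, and the absence of a recorded `payment_preimage` there is what distinguishes a timeout-path failure from a successful claim, so the commit adds only a query, no new serialized fields.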

lightning/src/ln/channelmanager.rs

Lines changed: 24 additions & 9 deletions
@@ -15650,7 +15650,7 @@ where
                log_error!(logger, " The ChannelMonitor for channel {} is at counterparty commitment transaction number {} but the ChannelManager is at counterparty commitment transaction number {}.",
                    &channel.context.channel_id(), monitor.get_cur_counterparty_commitment_number(), channel.get_cur_counterparty_commitment_transaction_number());
            }
-            let mut shutdown_result =
+            let shutdown_result =
                channel.force_shutdown(ClosureReason::OutdatedChannelManager);
            if shutdown_result.unbroadcasted_batch_funding_txid.is_some() {
                return Err(DecodeError::InvalidValue);

@@ -15682,7 +15682,10 @@ where

                        },
                    );
                }
-                failed_htlcs.append(&mut shutdown_result.dropped_outbound_htlcs);
+                for (source, hash, cp_id, chan_id) in shutdown_result.dropped_outbound_htlcs {
+                    let reason = LocalHTLCFailureReason::ChannelClosed;
+                    failed_htlcs.push((source, hash, cp_id, chan_id, reason));
+                }
                channel_closures.push_back((
                    events::Event::ChannelClosed {
                        channel_id: channel.context.channel_id(),

@@ -15724,6 +15727,7 @@ where

                        *payment_hash,
                        channel.context.get_counterparty_node_id(),
                        channel.context.channel_id(),
+                        LocalHTLCFailureReason::ChannelClosed,
                    ));
                }
            }

@@ -16446,6 +16450,20 @@ where

                    },
                }
            }
+            for (htlc_source, htlc) in monitor.get_onchain_failed_outbound_htlcs() {
+                log_info!(
+                    args.logger,
+                    "Failing HTLC with payment hash {} as it was resolved on-chain.",
+                    htlc.payment_hash
+                );
+                failed_htlcs.push((
+                    htlc_source,
+                    htlc.payment_hash,
+                    monitor.get_counterparty_node_id(),
+                    monitor.channel_id(),
+                    LocalHTLCFailureReason::OnChainTimeout,
+                ));
+            }
        }

        // Whether the downstream channel was closed or not, try to re-apply any payment

@@ -17128,13 +17146,10 @@ where

        }
    }

-    for htlc_source in failed_htlcs.drain(..) {
-        let (source, payment_hash, counterparty_node_id, channel_id) = htlc_source;
-        let failure_reason = LocalHTLCFailureReason::ChannelClosed;
-        let receiver = HTLCHandlingFailureType::Forward {
-            node_id: Some(counterparty_node_id),
-            channel_id,
-        };
+    for htlc_source in failed_htlcs {
+        let (source, payment_hash, counterparty_id, channel_id, failure_reason) = htlc_source;
+        let receiver =
+            HTLCHandlingFailureType::Forward { node_id: Some(counterparty_id), channel_id };
        let reason = HTLCFailReason::from_failure_code(failure_reason);
        channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
    }
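The other change worth calling out here is that each `failed_htlcs` entry now carries its own `LocalHTLCFailureReason` rather than the final loop hard-coding one: HTLCs dropped when force-shutting down an outdated channel still use `ChannelClosed`, while HTLCs replayed from the monitors above are failed with `OnChainTimeout`.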

lightning/src/ln/monitor_tests.rs

Lines changed: 175 additions & 0 deletions
@@ -3374,3 +3374,178 @@ fn test_lost_preimage_monitor_events() {
    do_test_lost_preimage_monitor_events(true);
    do_test_lost_preimage_monitor_events(false);
}
+
+fn do_test_lost_timeout_monitor_events(on_counterparty_tx: bool) {
+    // `MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the
+    // `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets
+    // persisted (i.e. due to a block update) then the node crashes, prior to persisting the
+    // `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be
+    // lost. This isn't likely in a sync persist environment, but in an async one this could be an
+    // issue.
+    //
+    // Note that this is only an issue for closed channels - `MonitorEvent`s only inform the
+    // `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup
+    // or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes
+    // completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved
+    // on-chain post closure. Of the three, only the last is problematic to lose prior to a reload.
+    //
+    // Here we test that losing `MonitorEvent`s that contain HTLC resolution via timeouts does not
+    // cause us to lose a `PaymentFailed` event.
+    let mut cfg = test_default_channel_config();
+    cfg.manually_accept_inbound_channels = true;
+    cfg.channel_handshake_config.negotiate_anchors_zero_fee_htlc_tx = true;
+    let cfgs = [Some(cfg.clone()), Some(cfg.clone()), Some(cfg.clone())];
+
+    let chanmon_cfgs = create_chanmon_cfgs(3);
+    let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
+    let persister;
+    let new_chain_mon;
+    let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &cfgs);
+    let node_b_reload;
+    let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
+
+    provide_anchor_reserves(&nodes);
+
+    let node_a_id = nodes[0].node.get_our_node_id();
+    let node_b_id = nodes[1].node.get_our_node_id();
+    let node_c_id = nodes[2].node.get_our_node_id();
+
+    let chan_a = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 0).2;
+    let chan_b = create_announced_chan_between_nodes_with_value(&nodes, 1, 2, 1_000_000, 0).2;
+
+    // Ensure all nodes are at the same height
+    let node_max_height =
+        nodes.iter().map(|node| node.blocks.lock().unwrap().len()).max().unwrap() as u32;
+    connect_blocks(&nodes[0], node_max_height - nodes[0].best_block_info().1);
+    connect_blocks(&nodes[1], node_max_height - nodes[1].best_block_info().1);
+    connect_blocks(&nodes[2], node_max_height - nodes[2].best_block_info().1);
+
+    let (_, hash_a, ..) = route_payment(&nodes[0], &[&nodes[1], &nodes[2]], 5_000_000);
+    let (_, hash_b, ..) = route_payment(&nodes[1], &[&nodes[2]], 5_000_000);
+
+    nodes[1].node.peer_disconnected(nodes[2].node.get_our_node_id());
+    nodes[2].node.peer_disconnected(nodes[1].node.get_our_node_id());
+
+    // Force-close the channel, confirming a commitment transaction then letting C claim the HTLCs.
+    let message = "Closed".to_owned();
+    nodes[2]
+        .node
+        .force_close_broadcasting_latest_txn(&chan_b, &node_b_id, message.clone())
+        .unwrap();
+    check_added_monitors(&nodes[2], 1);
+    let c_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
+    check_closed_event!(nodes[2], 1, c_reason, [node_b_id], 1_000_000);
+    check_closed_broadcast!(nodes[2], true);
+
+    let cs_commit_tx = nodes[2].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
+    assert_eq!(cs_commit_tx.len(), 1);
+
+    let message = "Closed".to_owned();
+    nodes[1]
+        .node
+        .force_close_broadcasting_latest_txn(&chan_b, &node_c_id, message.clone())
+        .unwrap();
+    check_added_monitors(&nodes[1], 1);
+    let b_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
+    check_closed_event!(nodes[1], 1, b_reason, [node_c_id], 1_000_000);
+    check_closed_broadcast!(nodes[1], true);
+
+    let bs_commit_tx = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
+    assert_eq!(bs_commit_tx.len(), 1);
+
+    let selected_commit_tx = if on_counterparty_tx {
+        &cs_commit_tx[0]
+    } else {
+        &bs_commit_tx[0]
+    };
+
+    mine_transaction(&nodes[1], selected_commit_tx);
+    // If the block gets connected first we may re-broadcast B's commitment transaction before
+    // seeing C's confirm.
+    nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clear();
+    connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
+    let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
+    if on_counterparty_tx {
+        assert_eq!(events.len(), 1, "{events:?}");
+        match events[0] {
+            Event::SpendableOutputs { .. } => {},
+            _ => panic!("Unexpected event {events:?}"),
+        }
+    } else {
+        assert_eq!(events.len(), 0);
+    }
+
+    connect_blocks(&nodes[1], TEST_FINAL_CLTV - ANTI_REORG_DELAY + 1);
+    if !on_counterparty_tx {
+        let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
+        assert_eq!(events.len(), 1, "{events:?}");
+        match events.pop().unwrap() {
+            Event::BumpTransaction(bump_event) => {
+                nodes[1].bump_tx_handler.handle_event(&bump_event);
+            },
+            _ => panic!("Unexpected event"),
+        }
+    }
+    let bs_htlc_timeouts =
+        nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
+    assert_eq!(bs_htlc_timeouts.len(), 1);
+
+    // Now replay the timeouts on node B, which after 6 confirmations should fail the HTLCs via
+    // `MonitorUpdate`s
+    mine_transactions(&nodes[1], &bs_htlc_timeouts.iter().collect::<Vec<_>>());
+    connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
+
+    // Now simulate a restart where the B<->C ChannelMonitor has been persisted (i.e. because we
+    // just processed a new block) but the ChannelManager was not. This should be exceedingly rare
+    // given we have to be connecting a block at the right moment and not manage to get a
+    // ChannelManager persisted after it does a thing that should immediately precede persistence,
+    // but with async persist it is more common.
+    //
+    // We do this by wiping the `MonitorEvent`s from the monitors and then reloading with the
+    // latest state.
+    let mon_events = nodes[1].chain_monitor.chain_monitor.release_pending_monitor_events();
+    assert_eq!(mon_events.len(), 1);
+    assert_eq!(mon_events[0].2.len(), 2);
+
+    let node_ser = nodes[1].node.encode();
+    let mon_a_ser = get_monitor!(nodes[1], chan_a).encode();
+    let mon_b_ser = get_monitor!(nodes[1], chan_b).encode();
+    let mons = &[&mon_a_ser[..], &mon_b_ser[..]];
+    reload_node!(nodes[1], cfg, &node_ser, mons, persister, new_chain_mon, node_b_reload);
+
+    let timeout_events = nodes[1].node.get_and_clear_pending_events();
+    assert_eq!(timeout_events.len(), 3, "{timeout_events:?}");
+    for ev in timeout_events {
+        match ev {
+            Event::PaymentPathFailed { payment_hash, .. } => {
+                assert_eq!(payment_hash, hash_b);
+            },
+            Event::PaymentFailed { payment_hash, .. } => {
+                assert_eq!(payment_hash, Some(hash_b));
+            },
+            Event::HTLCHandlingFailed { prev_channel_id, .. } => {
+                assert_eq!(prev_channel_id, chan_a);
+            },
+            _ => panic!("Wrong event {ev:?}"),
+        }
+    }
+
+    nodes[0].node.peer_disconnected(nodes[1].node.get_our_node_id());
+
+    let mut reconnect_args = ReconnectArgs::new(&nodes[0], &nodes[1]);
+    reconnect_args.pending_cell_htlc_fails = (0, 0);
+    reconnect_nodes(reconnect_args);
+
+    nodes[1].node.process_pending_htlc_forwards();
+    check_added_monitors(&nodes[1], 1);
+    let bs_fail = get_htlc_update_msgs(&nodes[1], &node_a_id);
+    nodes[0].node.handle_update_fail_htlc(node_b_id, &bs_fail.update_fail_htlcs[0]);
+    commitment_signed_dance!(nodes[0], nodes[1], bs_fail.commitment_signed, true, true);
+    expect_payment_failed!(nodes[0], hash_a, false);
+}
+
+#[test]
+fn test_lost_timeout_monitor_events() {
+    do_test_lost_timeout_monitor_events(true);
+    do_test_lost_timeout_monitor_events(false);
+}
