Skip to content

Commit bc30b03

Browse files
committed
Re-fail perm-failed HTLCs on startup in case of MonitorEvent loss
`MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fasion - if the `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets persisted (i.e. due to a block update) then the node crashes, prior to persisting the `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManger` will be lost. This isn't likely in a sync persist environment, but in an async one this could be an issue. Note that this is only an issue for closed channels - `MonitorEvent`s only inform the `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved on-chain post closure. Of the three, only the last is problematic to lose prior to a reload. In a previous commit we handled the case of claimed HTLCs by replaying payment preimages on startup to avoid `MonitorEvent` loss causing us to miss an HTLC claim. Here we handle the HTLC-failed case similarly. Unlike with HTLC claims via preimage, we don't already have replay logic in `ChannelManager` startup, but its easy enough to add one. Luckily, we already track when an HTLC reaches permanently-failed state in `ChannelMonitor` (i.e. it has `ANTI_REORG_DELAY` confirmations on-chain on the failing transaction), so all we need to do is add the ability to query for that and fail them on `ChannelManager` startup.
1 parent cac2479 commit bc30b03

File tree

3 files changed

+270
-9
lines changed

3 files changed

+270
-9
lines changed

lightning/src/chain/channelmonitor.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2983,6 +2983,75 @@ impl<Signer: EcdsaChannelSigner> ChannelMonitor<Signer> {
29832983
res
29842984
}
29852985

2986+
/// Gets the set of outbound HTLCs which hit the chain and ultimately were claimed by us via
2987+
/// the timeout path and reached [`ANTI_REORG_DELAY`] confirmations. This is used to determine
2988+
/// if an HTLC has failed without the `ChannelManager` having seen it prior to being persisted.
2989+
pub(crate) fn get_onchain_failed_outbound_htlcs(
2990+
&self,
2991+
) -> HashMap<HTLCSource, HTLCOutputInCommitment> {
2992+
let us = self.inner.lock().unwrap();
2993+
// We're only concerned with the confirmation count of HTLC transactions, and don't
2994+
// actually care how many confirmations a commitment transaction may or may not have. Thus,
2995+
// we look for either a FundingSpendConfirmation event or a funding_spend_confirmed.
2996+
let confirmed_txid = us.funding_spend_confirmed.or_else(|| {
2997+
us.onchain_events_awaiting_threshold_conf.iter().find_map(|event| {
2998+
if let OnchainEvent::FundingSpendConfirmation { .. } = event.event {
2999+
Some(event.txid)
3000+
} else {
3001+
None
3002+
}
3003+
})
3004+
});
3005+
3006+
if confirmed_txid.is_none() {
3007+
return new_hash_map();
3008+
}
3009+
3010+
let mut res = new_hash_map();
3011+
macro_rules! walk_htlcs {
3012+
($holder_commitment: expr, $htlc_iter: expr) => {
3013+
for (htlc, source) in $htlc_iter {
3014+
let filter = |v: &&IrrevocablyResolvedHTLC| {
3015+
v.commitment_tx_output_idx == htlc.transaction_output_index
3016+
};
3017+
if let Some(state) = us.htlcs_resolved_on_chain.iter().filter(filter).next() {
3018+
if let Some(source) = source {
3019+
if state.payment_preimage.is_none() {
3020+
res.insert(source.clone(), htlc.clone());
3021+
}
3022+
}
3023+
}
3024+
}
3025+
};
3026+
}
3027+
3028+
let txid = confirmed_txid.unwrap();
3029+
if Some(txid) == us.funding.current_counterparty_commitment_txid
3030+
|| Some(txid) == us.funding.prev_counterparty_commitment_txid
3031+
{
3032+
walk_htlcs!(
3033+
false,
3034+
us.funding.counterparty_claimable_outpoints.get(&txid).unwrap().iter().filter_map(
3035+
|(a, b)| {
3036+
if let &Some(ref source) = b {
3037+
Some((a, Some(&**source)))
3038+
} else {
3039+
None
3040+
}
3041+
}
3042+
)
3043+
);
3044+
} else if txid == us.funding.current_holder_commitment_tx.trust().txid() {
3045+
walk_htlcs!(true, holder_commitment_htlcs!(us, CURRENT_WITH_SOURCES));
3046+
} else if let Some(prev_commitment_tx) = &us.funding.prev_holder_commitment_tx {
3047+
if txid == prev_commitment_tx.trust().txid() {
3048+
walk_htlcs!(true, holder_commitment_htlcs!(us, PREV_WITH_SOURCES).unwrap());
3049+
}
3050+
}
3051+
3052+
res
3053+
}
3054+
29863055
/// Gets the set of outbound HTLCs which are pending resolution in this channel or which were
29873056
/// resolved with a preimage from our counterparty.
29883057
///

lightning/src/ln/channelmanager.rs

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15739,7 +15739,7 @@ where
1573915739
log_error!(logger, " The ChannelMonitor for channel {} is at counterparty commitment transaction number {} but the ChannelManager is at counterparty commitment transaction number {}.",
1574015740
&channel.context.channel_id(), monitor.get_cur_counterparty_commitment_number(), channel.get_cur_counterparty_commitment_transaction_number());
1574115741
}
15742-
let mut shutdown_result =
15742+
let shutdown_result =
1574315743
channel.force_shutdown(ClosureReason::OutdatedChannelManager);
1574415744
if shutdown_result.unbroadcasted_batch_funding_txid.is_some() {
1574515745
return Err(DecodeError::InvalidValue);
@@ -15771,7 +15771,10 @@ where
1577115771
},
1577215772
);
1577315773
}
15774-
failed_htlcs.append(&mut shutdown_result.dropped_outbound_htlcs);
15774+
for (source, hash, cp_id, chan_id) in shutdown_result.dropped_outbound_htlcs {
15775+
let reason = LocalHTLCFailureReason::ChannelClosed;
15776+
failed_htlcs.push((source, hash, cp_id, chan_id, reason));
15777+
}
1577515778
channel_closures.push_back((
1577615779
events::Event::ChannelClosed {
1577715780
channel_id: channel.context.channel_id(),
@@ -15813,6 +15816,7 @@ where
1581315816
*payment_hash,
1581415817
channel.context.get_counterparty_node_id(),
1581515818
channel.context.channel_id(),
15819+
LocalHTLCFailureReason::ChannelClosed,
1581615820
));
1581715821
}
1581815822
}
@@ -16537,6 +16541,20 @@ where
1653716541
},
1653816542
}
1653916543
}
16544+
for (htlc_source, htlc) in monitor.get_onchain_failed_outbound_htlcs() {
16545+
log_info!(
16546+
args.logger,
16547+
"Failing HTLC with payment hash {} as it was resolved on-chain.",
16548+
htlc.payment_hash
16549+
);
16550+
failed_htlcs.push((
16551+
htlc_source,
16552+
htlc.payment_hash,
16553+
monitor.get_counterparty_node_id(),
16554+
monitor.channel_id(),
16555+
LocalHTLCFailureReason::OnChainTimeout,
16556+
));
16557+
}
1654016558
}
1654116559

1654216560
// Whether the downstream channel was closed or not, try to re-apply any payment
@@ -17217,13 +17235,10 @@ where
1721717235
}
1721817236
}
1721917237

17220-
for htlc_source in failed_htlcs.drain(..) {
17221-
let (source, payment_hash, counterparty_node_id, channel_id) = htlc_source;
17222-
let failure_reason = LocalHTLCFailureReason::ChannelClosed;
17223-
let receiver = HTLCHandlingFailureType::Forward {
17224-
node_id: Some(counterparty_node_id),
17225-
channel_id,
17226-
};
17238+
for htlc_source in failed_htlcs {
17239+
let (source, payment_hash, counterparty_id, channel_id, failure_reason) = htlc_source;
17240+
let receiver =
17241+
HTLCHandlingFailureType::Forward { node_id: Some(counterparty_id), channel_id };
1722717242
let reason = HTLCFailReason::from_failure_code(failure_reason);
1722817243
channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
1722917244
}

lightning/src/ln/monitor_tests.rs

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3463,3 +3463,180 @@ fn test_lost_preimage_monitor_events() {
34633463
do_test_lost_preimage_monitor_events(true);
34643464
do_test_lost_preimage_monitor_events(false);
34653465
}
3466+
3467+
fn do_test_lost_timeout_monitor_events(on_counterparty_tx: bool) {
3468+
// `MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the
3469+
// `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets
3470+
// persisted (i.e. due to a block update) then the node crashes, prior to persisting the
3471+
// `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be
3472+
// lost. This isn't likely in a sync persist environment, but in an async one this could be an
3473+
// issue.
3474+
//
3475+
// Note that this is only an issue for closed channels - `MonitorEvent`s only inform the
3476+
// `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup
3477+
// or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes
3478+
// completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved
3479+
// on-chain post closure. Of the three, only the last is problematic to lose prior to a reload.
3480+
//
3481+
// Here we test that losing `MonitorEvent`s that contain HTLC resolution via timeouts does not
3482+
// cause us to lose a `PaymentFailed` event.
3483+
let mut cfg = test_default_channel_config();
3484+
cfg.manually_accept_inbound_channels = true;
3485+
cfg.channel_handshake_config.negotiate_anchors_zero_fee_htlc_tx = true;
3486+
let cfgs = [Some(cfg.clone()), Some(cfg.clone()), Some(cfg.clone())];
3487+
3488+
let chanmon_cfgs = create_chanmon_cfgs(3);
3489+
let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
3490+
let persister;
3491+
let new_chain_mon;
3492+
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &cfgs);
3493+
let node_b_reload;
3494+
let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
3495+
3496+
provide_anchor_reserves(&nodes);
3497+
3498+
let node_a_id = nodes[0].node.get_our_node_id();
3499+
let node_b_id = nodes[1].node.get_our_node_id();
3500+
let node_c_id = nodes[2].node.get_our_node_id();
3501+
3502+
let chan_a = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 0).2;
3503+
let chan_b = create_announced_chan_between_nodes_with_value(&nodes, 1, 2, 1_000_000, 0).2;
3504+
3505+
// Ensure all nodes are at the same height
3506+
let node_max_height =
3507+
nodes.iter().map(|node| node.blocks.lock().unwrap().len()).max().unwrap() as u32;
3508+
connect_blocks(&nodes[0], node_max_height - nodes[0].best_block_info().1);
3509+
connect_blocks(&nodes[1], node_max_height - nodes[1].best_block_info().1);
3510+
connect_blocks(&nodes[2], node_max_height - nodes[2].best_block_info().1);
3511+
3512+
let (_, hash_a, ..) = route_payment(&nodes[0], &[&nodes[1], &nodes[2]], 5_000_000);
3513+
let (_, hash_b, ..) = route_payment(&nodes[1], &[&nodes[2]], 5_000_000);
3514+
3515+
nodes[1].node.peer_disconnected(nodes[2].node.get_our_node_id());
3516+
nodes[2].node.peer_disconnected(nodes[1].node.get_our_node_id());
3517+
3518+
// Force-close the channel, confirming a commitment transaction then letting C claim the HTLCs.
3519+
let message = "Closed".to_owned();
3520+
nodes[2]
3521+
.node
3522+
.force_close_broadcasting_latest_txn(&chan_b, &node_b_id, message.clone())
3523+
.unwrap();
3524+
check_added_monitors(&nodes[2], 1);
3525+
let c_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3526+
check_closed_event!(nodes[2], 1, c_reason, [node_b_id], 1_000_000);
3527+
check_closed_broadcast!(nodes[2], true);
3528+
3529+
handle_bump_events(&nodes[2], true, 0);
3530+
let cs_commit_tx = nodes[2].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3531+
assert_eq!(cs_commit_tx.len(), 1);
3532+
3533+
let message = "Closed".to_owned();
3534+
nodes[1]
3535+
.node
3536+
.force_close_broadcasting_latest_txn(&chan_b, &node_c_id, message.clone())
3537+
.unwrap();
3538+
check_added_monitors(&nodes[1], 1);
3539+
let b_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3540+
check_closed_event!(nodes[1], 1, b_reason, [node_c_id], 1_000_000);
3541+
check_closed_broadcast!(nodes[1], true);
3542+
3543+
handle_bump_events(&nodes[1], true, 0);
3544+
let bs_commit_tx = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3545+
assert_eq!(bs_commit_tx.len(), 1);
3546+
3547+
let selected_commit_tx = if on_counterparty_tx {
3548+
&cs_commit_tx[0]
3549+
} else {
3550+
&bs_commit_tx[0]
3551+
};
3552+
3553+
mine_transaction(&nodes[1], selected_commit_tx);
3554+
// If the block gets connected first we may re-broadcast B's commitment transaction before
3555+
// seeing C's commitment transaction confirm.
3556+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clear();
3557+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3558+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3559+
if on_counterparty_tx {
3560+
assert_eq!(events.len(), 1, "{events:?}");
3561+
match events[0] {
3562+
Event::SpendableOutputs { .. } => {},
3563+
_ => panic!("Unexpected event {events:?}"),
3564+
}
3565+
} else {
3566+
assert_eq!(events.len(), 0);
3567+
}
3568+
3569+
connect_blocks(&nodes[1], TEST_FINAL_CLTV - ANTI_REORG_DELAY + 1);
3570+
if !on_counterparty_tx {
3571+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3572+
assert_eq!(events.len(), 1, "{events:?}");
3573+
match events.pop().unwrap() {
3574+
Event::BumpTransaction(bump_event) => {
3575+
nodes[1].bump_tx_handler.handle_event(&bump_event);
3576+
},
3577+
_ => panic!("Unexpected event"),
3578+
}
3579+
}
3580+
let bs_htlc_timeouts =
3581+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3582+
assert_eq!(bs_htlc_timeouts.len(), 1);
3583+
3584+
// Now replay the timeouts on node B, which after 6 confirmations should fail the HTLCs via
3585+
// `MonitorUpdate`s
3586+
mine_transactions(&nodes[1], &bs_htlc_timeouts.iter().collect::<Vec<_>>());
3587+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3588+
3589+
// Now simulate a restart where the B<->C ChannelMonitor has been persisted (i.e. because we
3590+
// just processed a new block) but the ChannelManager was not. This should be exceedingly rare
3591+
// given we have to be connecting a block at the right moment and not manage to get a
3592+
// ChannelManager persisted after it does a thing that should immediately precede persistence,
3593+
// but with async persist it is more common.
3594+
//
3595+
// We do this by wiping the `MonitorEvent`s from the monitors and then reloading with the
3596+
// latest state.
3597+
let mon_events = nodes[1].chain_monitor.chain_monitor.release_pending_monitor_events();
3598+
assert_eq!(mon_events.len(), 1);
3599+
assert_eq!(mon_events[0].2.len(), 2);
3600+
3601+
let node_ser = nodes[1].node.encode();
3602+
let mon_a_ser = get_monitor!(nodes[1], chan_a).encode();
3603+
let mon_b_ser = get_monitor!(nodes[1], chan_b).encode();
3604+
let mons = &[&mon_a_ser[..], &mon_b_ser[..]];
3605+
reload_node!(nodes[1], cfg, &node_ser, mons, persister, new_chain_mon, node_b_reload);
3606+
3607+
let timeout_events = nodes[1].node.get_and_clear_pending_events();
3608+
assert_eq!(timeout_events.len(), 3, "{timeout_events:?}");
3609+
for ev in timeout_events {
3610+
match ev {
3611+
Event::PaymentPathFailed { payment_hash, .. } => {
3612+
assert_eq!(payment_hash, hash_b);
3613+
},
3614+
Event::PaymentFailed { payment_hash, .. } => {
3615+
assert_eq!(payment_hash, Some(hash_b));
3616+
},
3617+
Event::HTLCHandlingFailed { prev_channel_id, .. } => {
3618+
assert_eq!(prev_channel_id, chan_a);
3619+
},
3620+
_ => panic!("Wrong event {ev:?}"),
3621+
}
3622+
}
3623+
3624+
nodes[0].node.peer_disconnected(nodes[1].node.get_our_node_id());
3625+
3626+
let mut reconnect_args = ReconnectArgs::new(&nodes[0], &nodes[1]);
3627+
reconnect_args.pending_cell_htlc_fails = (0, 0);
3628+
reconnect_nodes(reconnect_args);
3629+
3630+
nodes[1].node.process_pending_htlc_forwards();
3631+
check_added_monitors(&nodes[1], 1);
3632+
let bs_fail = get_htlc_update_msgs(&nodes[1], &node_a_id);
3633+
nodes[0].node.handle_update_fail_htlc(node_b_id, &bs_fail.update_fail_htlcs[0]);
3634+
commitment_signed_dance!(nodes[0], nodes[1], bs_fail.commitment_signed, true, true);
3635+
expect_payment_failed!(nodes[0], hash_a, false);
3636+
}
3637+
3638+
#[test]
3639+
fn test_lost_timeout_monitor_events() {
3640+
do_test_lost_timeout_monitor_events(true);
3641+
do_test_lost_timeout_monitor_events(false);
3642+
}

0 commit comments

Comments
 (0)