Skip to content

Commit dc20515

Browse files
committed
Re-fail perm-failed HTLCs on startup in case of MonitorEvent loss
`MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets persisted (i.e. due to a block update) then the node crashes, prior to persisting the `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be lost. This isn't likely in a sync persist environment, but in an async one this could be an issue. Note that this is only an issue for closed channels - `MonitorEvent`s only inform the `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved on-chain post closure. Of the three, only the last is problematic to lose prior to a reload. In a previous commit we handled the case of claimed HTLCs by replaying payment preimages on startup to avoid `MonitorEvent` loss causing us to miss an HTLC claim. Here we handle the HTLC-failed case similarly. Unlike with HTLC claims via preimage, we don't already have replay logic in `ChannelManager` startup, but it's easy enough to add one. Luckily, we already track when an HTLC reaches permanently-failed state in `ChannelMonitor` (i.e. it has `ANTI_REORG_DELAY` confirmations on-chain on the failing transaction), so all we need to do is add the ability to query for that and fail them on `ChannelManager` startup.
1 parent db6d73c commit dc20515

File tree

3 files changed

+268
-9
lines changed

3 files changed

+268
-9
lines changed

lightning/src/chain/channelmonitor.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2957,6 +2957,75 @@ impl<Signer: EcdsaChannelSigner> ChannelMonitor<Signer> {
29572957
res
29582958
}
29592959

2960+
/// Gets the set of outbound HTLCs which hit the chain and ultimately were claimed by us via
2961+
/// the timeout path and reached [`ANTI_REORG_DELAY`] confirmations. This is used to determine
2962+
/// if an HTLC has failed without the `ChannelManager` having seen it prior to being persisted.
2963+
pub(crate) fn get_onchain_failed_outbound_htlcs(
2964+
&self,
2965+
) -> HashMap<HTLCSource, HTLCOutputInCommitment> {
2966+
let us = self.inner.lock().unwrap();
2967+
// We're only concerned with the confirmation count of HTLC transactions, and don't
2968+
// actually care how many confirmations a commitment transaction may or may not have. Thus,
2969+
// we look for either a FundingSpendConfirmation event or a funding_spend_confirmed.
2970+
let confirmed_txid = us.funding_spend_confirmed.or_else(|| {
2971+
us.onchain_events_awaiting_threshold_conf.iter().find_map(|event| {
2972+
if let OnchainEvent::FundingSpendConfirmation { .. } = event.event {
2973+
Some(event.txid)
2974+
} else {
2975+
None
2976+
}
2977+
})
2978+
});
2979+
2980+
if confirmed_txid.is_none() {
2981+
return new_hash_map();
2982+
}
2983+
2984+
let mut res = new_hash_map();
2985+
macro_rules! walk_htlcs {
2986+
($holder_commitment: expr, $htlc_iter: expr) => {
2987+
for (htlc, source) in $htlc_iter {
2988+
let filter = |v: &&IrrevocablyResolvedHTLC| {
2989+
v.commitment_tx_output_idx == htlc.transaction_output_index
2990+
};
2991+
if let Some(state) = us.htlcs_resolved_on_chain.iter().filter(filter).next() {
2992+
if let Some(source) = source {
2993+
if state.payment_preimage.is_none() {
2994+
res.insert(source.clone(), htlc.clone());
2995+
}
2996+
}
2997+
}
2998+
}
2999+
};
3000+
}
3001+
3002+
let txid = confirmed_txid.unwrap();
3003+
if Some(txid) == us.funding.current_counterparty_commitment_txid
3004+
|| Some(txid) == us.funding.prev_counterparty_commitment_txid
3005+
{
3006+
walk_htlcs!(
3007+
false,
3008+
us.funding.counterparty_claimable_outpoints.get(&txid).unwrap().iter().filter_map(
3009+
|(a, b)| {
3010+
if let &Some(ref source) = b {
3011+
Some((a, Some(&**source)))
3012+
} else {
3013+
None
3014+
}
3015+
}
3016+
)
3017+
);
3018+
} else if txid == us.funding.current_holder_commitment_tx.trust().txid() {
3019+
walk_htlcs!(true, holder_commitment_htlcs!(us, CURRENT_WITH_SOURCES));
3020+
} else if let Some(prev_commitment_tx) = &us.funding.prev_holder_commitment_tx {
3021+
if txid == prev_commitment_tx.trust().txid() {
3022+
walk_htlcs!(true, holder_commitment_htlcs!(us, PREV_WITH_SOURCES).unwrap());
3023+
}
3024+
}
3025+
3026+
res
3027+
}
3028+
29603029
/// Gets the set of outbound HTLCs which are pending resolution in this channel or which were
29613030
/// resolved with a preimage from our counterparty.
29623031
///

lightning/src/ln/channelmanager.rs

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15842,7 +15842,7 @@ where
1584215842
log_error!(logger, " The ChannelMonitor for channel {} is at counterparty commitment transaction number {} but the ChannelManager is at counterparty commitment transaction number {}.",
1584315843
&channel.context.channel_id(), monitor.get_cur_counterparty_commitment_number(), channel.get_cur_counterparty_commitment_transaction_number());
1584415844
}
15845-
let mut shutdown_result =
15845+
let shutdown_result =
1584615846
channel.force_shutdown(ClosureReason::OutdatedChannelManager);
1584715847
if shutdown_result.unbroadcasted_batch_funding_txid.is_some() {
1584815848
return Err(DecodeError::InvalidValue);
@@ -15874,7 +15874,10 @@ where
1587415874
},
1587515875
);
1587615876
}
15877-
failed_htlcs.append(&mut shutdown_result.dropped_outbound_htlcs);
15877+
for (source, hash, cp_id, chan_id) in shutdown_result.dropped_outbound_htlcs {
15878+
let reason = LocalHTLCFailureReason::ChannelClosed;
15879+
failed_htlcs.push((source, hash, cp_id, chan_id, reason));
15880+
}
1587815881
channel_closures.push_back((
1587915882
events::Event::ChannelClosed {
1588015883
channel_id: channel.context.channel_id(),
@@ -15916,6 +15919,7 @@ where
1591615919
*payment_hash,
1591715920
channel.context.get_counterparty_node_id(),
1591815921
channel.context.channel_id(),
15922+
LocalHTLCFailureReason::ChannelClosed,
1591915923
));
1592015924
}
1592115925
}
@@ -16640,6 +16644,20 @@ where
1664016644
},
1664116645
}
1664216646
}
16647+
for (htlc_source, htlc) in monitor.get_onchain_failed_outbound_htlcs() {
16648+
log_info!(
16649+
args.logger,
16650+
"Failing HTLC with payment hash {} as it was resolved on-chain.",
16651+
htlc.payment_hash
16652+
);
16653+
failed_htlcs.push((
16654+
htlc_source,
16655+
htlc.payment_hash,
16656+
monitor.get_counterparty_node_id(),
16657+
monitor.channel_id(),
16658+
LocalHTLCFailureReason::OnChainTimeout,
16659+
));
16660+
}
1664316661
}
1664416662

1664516663
// Whether the downstream channel was closed or not, try to re-apply any payment
@@ -17320,13 +17338,10 @@ where
1732017338
}
1732117339
}
1732217340

17323-
for htlc_source in failed_htlcs.drain(..) {
17324-
let (source, payment_hash, counterparty_node_id, channel_id) = htlc_source;
17325-
let failure_reason = LocalHTLCFailureReason::ChannelClosed;
17326-
let receiver = HTLCHandlingFailureType::Forward {
17327-
node_id: Some(counterparty_node_id),
17328-
channel_id,
17329-
};
17341+
for htlc_source in failed_htlcs {
17342+
let (source, payment_hash, counterparty_id, channel_id, failure_reason) = htlc_source;
17343+
let receiver =
17344+
HTLCHandlingFailureType::Forward { node_id: Some(counterparty_id), channel_id };
1733017345
let reason = HTLCFailReason::from_failure_code(failure_reason);
1733117346
channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
1733217347
}

lightning/src/ln/monitor_tests.rs

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3453,3 +3453,178 @@ fn test_lost_preimage_monitor_events() {
34533453
do_test_lost_preimage_monitor_events(true);
34543454
do_test_lost_preimage_monitor_events(false);
34553455
}
3456+
3457+
fn do_test_lost_timeout_monitor_events(on_counterparty_tx: bool) {
3458+
// `MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the
3459+
// `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets
3460+
// persisted (i.e. due to a block update) then the node crashes, prior to persisting the
3461+
// `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be
3462+
// lost. This isn't likely in a sync persist environment, but in an async one this could be an
3463+
// issue.
3464+
//
3465+
// Note that this is only an issue for closed channels - `MonitorEvent`s only inform the
3466+
// `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup
3467+
// or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes
3468+
// completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved
3469+
// on-chain post closure. Of the three, only the last is problematic to lose prior to a reload.
3470+
//
3471+
// Here we test that losing `MonitorEvent`s that contain HTLC resolution via timeouts does not
3472+
// cause us to lose a `PaymentFailed` event.
3473+
let mut cfg = test_default_channel_config();
3474+
cfg.manually_accept_inbound_channels = true;
3475+
cfg.channel_handshake_config.negotiate_anchors_zero_fee_htlc_tx = true;
3476+
let cfgs = [Some(cfg.clone()), Some(cfg.clone()), Some(cfg.clone())];
3477+
3478+
let chanmon_cfgs = create_chanmon_cfgs(3);
3479+
let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
3480+
let persister;
3481+
let new_chain_mon;
3482+
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &cfgs);
3483+
let node_b_reload;
3484+
let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
3485+
3486+
provide_anchor_reserves(&nodes);
3487+
3488+
let node_a_id = nodes[0].node.get_our_node_id();
3489+
let node_b_id = nodes[1].node.get_our_node_id();
3490+
let node_c_id = nodes[2].node.get_our_node_id();
3491+
3492+
let chan_a = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 0).2;
3493+
let chan_b = create_announced_chan_between_nodes_with_value(&nodes, 1, 2, 1_000_000, 0).2;
3494+
3495+
// Ensure all nodes are at the same height
3496+
let node_max_height =
3497+
nodes.iter().map(|node| node.blocks.lock().unwrap().len()).max().unwrap() as u32;
3498+
connect_blocks(&nodes[0], node_max_height - nodes[0].best_block_info().1);
3499+
connect_blocks(&nodes[1], node_max_height - nodes[1].best_block_info().1);
3500+
connect_blocks(&nodes[2], node_max_height - nodes[2].best_block_info().1);
3501+
3502+
let (_, hash_a, ..) = route_payment(&nodes[0], &[&nodes[1], &nodes[2]], 5_000_000);
3503+
let (_, hash_b, ..) = route_payment(&nodes[1], &[&nodes[2]], 5_000_000);
3504+
3505+
nodes[1].node.peer_disconnected(nodes[2].node.get_our_node_id());
3506+
nodes[2].node.peer_disconnected(nodes[1].node.get_our_node_id());
3507+
3508+
// Force-close the channel, confirming a commitment transaction then letting C claim the HTLCs.
3509+
let message = "Closed".to_owned();
3510+
nodes[2]
3511+
.node
3512+
.force_close_broadcasting_latest_txn(&chan_b, &node_b_id, message.clone())
3513+
.unwrap();
3514+
check_added_monitors(&nodes[2], 1);
3515+
let c_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3516+
check_closed_event!(nodes[2], 1, c_reason, [node_b_id], 1_000_000);
3517+
check_closed_broadcast!(nodes[2], true);
3518+
3519+
let cs_commit_tx = nodes[2].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3520+
assert_eq!(cs_commit_tx.len(), 1);
3521+
3522+
let message = "Closed".to_owned();
3523+
nodes[1]
3524+
.node
3525+
.force_close_broadcasting_latest_txn(&chan_b, &node_c_id, message.clone())
3526+
.unwrap();
3527+
check_added_monitors(&nodes[1], 1);
3528+
let b_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3529+
check_closed_event!(nodes[1], 1, b_reason, [node_c_id], 1_000_000);
3530+
check_closed_broadcast!(nodes[1], true);
3531+
3532+
let bs_commit_tx = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3533+
assert_eq!(bs_commit_tx.len(), 1);
3534+
3535+
let selected_commit_tx = if on_counterparty_tx {
3536+
&cs_commit_tx[0]
3537+
} else {
3538+
&bs_commit_tx[0]
3539+
};
3540+
3541+
mine_transaction(&nodes[1], selected_commit_tx);
3542+
// If the block gets connected first we may re-broadcast B's commitment transaction before
3543+
// seeing C's confirm.
3544+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clear();
3545+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3546+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3547+
if on_counterparty_tx {
3548+
assert_eq!(events.len(), 1, "{events:?}");
3549+
match events[0] {
3550+
Event::SpendableOutputs { .. } => {},
3551+
_ => panic!("Unexpected event {events:?}"),
3552+
}
3553+
} else {
3554+
assert_eq!(events.len(), 0);
3555+
}
3556+
3557+
connect_blocks(&nodes[1], TEST_FINAL_CLTV - ANTI_REORG_DELAY + 1);
3558+
if !on_counterparty_tx {
3559+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3560+
assert_eq!(events.len(), 1, "{events:?}");
3561+
match events.pop().unwrap() {
3562+
Event::BumpTransaction(bump_event) => {
3563+
nodes[1].bump_tx_handler.handle_event(&bump_event);
3564+
},
3565+
_ => panic!("Unexpected event"),
3566+
}
3567+
}
3568+
let bs_htlc_timeouts =
3569+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3570+
assert_eq!(bs_htlc_timeouts.len(), 1);
3571+
3572+
// Now replay the timeouts on node B, which after 6 confirmations should fail the HTLCs via
3573+
// `MonitorUpdate`s
3574+
mine_transactions(&nodes[1], &bs_htlc_timeouts.iter().collect::<Vec<_>>());
3575+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3576+
3577+
// Now simulate a restart where the B<->C ChannelMonitor has been persisted (i.e. because we
3578+
// just processed a new block) but the ChannelManager was not. This should be exceedingly rare
3579+
// given we have to be connecting a block at the right moment and not manage to get a
3580+
// ChannelManager persisted after it does a thing that should immediately precede persistence,
3581+
// but with async persist it is more common.
3582+
//
3583+
// We do this by wiping the `MonitorEvent`s from the monitors and then reloading with the
3584+
// latest state.
3585+
let mon_events = nodes[1].chain_monitor.chain_monitor.release_pending_monitor_events();
3586+
assert_eq!(mon_events.len(), 1);
3587+
assert_eq!(mon_events[0].2.len(), 2);
3588+
3589+
let node_ser = nodes[1].node.encode();
3590+
let mon_a_ser = get_monitor!(nodes[1], chan_a).encode();
3591+
let mon_b_ser = get_monitor!(nodes[1], chan_b).encode();
3592+
let mons = &[&mon_a_ser[..], &mon_b_ser[..]];
3593+
reload_node!(nodes[1], cfg, &node_ser, mons, persister, new_chain_mon, node_b_reload);
3594+
3595+
let timeout_events = nodes[1].node.get_and_clear_pending_events();
3596+
assert_eq!(timeout_events.len(), 3, "{timeout_events:?}");
3597+
for ev in timeout_events {
3598+
match ev {
3599+
Event::PaymentPathFailed { payment_hash, .. } => {
3600+
assert_eq!(payment_hash, hash_b);
3601+
},
3602+
Event::PaymentFailed { payment_hash, .. } => {
3603+
assert_eq!(payment_hash, Some(hash_b));
3604+
},
3605+
Event::HTLCHandlingFailed { prev_channel_id, .. } => {
3606+
assert_eq!(prev_channel_id, chan_a);
3607+
},
3608+
_ => panic!("Wrong event {ev:?}"),
3609+
}
3610+
}
3611+
3612+
nodes[0].node.peer_disconnected(nodes[1].node.get_our_node_id());
3613+
3614+
let mut reconnect_args = ReconnectArgs::new(&nodes[0], &nodes[1]);
3615+
reconnect_args.pending_cell_htlc_fails = (0, 0);
3616+
reconnect_nodes(reconnect_args);
3617+
3618+
nodes[1].node.process_pending_htlc_forwards();
3619+
check_added_monitors(&nodes[1], 1);
3620+
let bs_fail = get_htlc_update_msgs(&nodes[1], &node_a_id);
3621+
nodes[0].node.handle_update_fail_htlc(node_b_id, &bs_fail.update_fail_htlcs[0]);
3622+
commitment_signed_dance!(nodes[0], nodes[1], bs_fail.commitment_signed, true, true);
3623+
expect_payment_failed!(nodes[0], hash_a, false);
3624+
}
3625+
3626+
#[test]
3627+
fn test_lost_timeout_monitor_events() {
3628+
do_test_lost_timeout_monitor_events(true);
3629+
do_test_lost_timeout_monitor_events(false);
3630+
}

0 commit comments

Comments
 (0)