Skip to content

Commit 86a58b6

Browse files
committed
Re-fail perm-failed HTLCs on startup in case of MonitorEvent loss
`MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets persisted (i.e. due to a block update), and then the node crashes prior to persisting the `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be lost. This isn't likely in a sync persist environment, but in an async one this could be an issue. Note that this is only an issue for closed channels - `MonitorEvent`s only inform the `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved on-chain post closure. Of the three, only the last is problematic to lose prior to a reload. In a previous commit we handled the case of claimed HTLCs by replaying payment preimages on startup to avoid `MonitorEvent` loss causing us to miss an HTLC claim. Here we handle the HTLC-failed case similarly. Unlike with HTLC claims via preimage, we don't already have replay logic in `ChannelManager` startup, but it's easy enough to add one. Luckily, we already track when an HTLC reaches permanently-failed state in `ChannelMonitor` (i.e. it has `ANTI_REORG_DELAY` confirmations on-chain on the failing transaction), so all we need to do is add the ability to query for that and fail them on `ChannelManager` startup.
1 parent a3abac3 commit 86a58b6

File tree

3 files changed

+270
-9
lines changed

3 files changed

+270
-9
lines changed

lightning/src/chain/channelmonitor.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2957,6 +2957,75 @@ impl<Signer: EcdsaChannelSigner> ChannelMonitor<Signer> {
29572957
res
29582958
}
29592959

2960+
/// Gets the set of outbound HTLCs which hit the chain and ultimately were claimed by us via
2961+
/// the timeout path and reached [`ANTI_REORG_DELAY`] confirmations. This is used to determine
2962+
/// if an HTLC has failed without the `ChannelManager` having seen it prior to being persisted.
2963+
pub(crate) fn get_onchain_failed_outbound_htlcs(
2964+
&self,
2965+
) -> HashMap<HTLCSource, HTLCOutputInCommitment> {
2966+
let us = self.inner.lock().unwrap();
2967+
// We're only concerned with the confirmation count of HTLC transactions, and don't
2968+
// actually care how many confirmations a commitment transaction may or may not have. Thus,
2969+
// we look for either a FundingSpendConfirmation event or a funding_spend_confirmed.
2970+
let confirmed_txid = us.funding_spend_confirmed.or_else(|| {
2971+
us.onchain_events_awaiting_threshold_conf.iter().find_map(|event| {
2972+
if let OnchainEvent::FundingSpendConfirmation { .. } = event.event {
2973+
Some(event.txid)
2974+
} else {
2975+
None
2976+
}
2977+
})
2978+
});
2979+
2980+
if confirmed_txid.is_none() {
2981+
return new_hash_map();
2982+
}
2983+
2984+
let mut res = new_hash_map();
2985+
macro_rules! walk_htlcs {
2986+
($holder_commitment: expr, $htlc_iter: expr) => {
2987+
for (htlc, source) in $htlc_iter {
2988+
let filter = |v: &&IrrevocablyResolvedHTLC| {
2989+
v.commitment_tx_output_idx == htlc.transaction_output_index
2990+
};
2991+
if let Some(state) = us.htlcs_resolved_on_chain.iter().filter(filter).next() {
2992+
if let Some(source) = source {
2993+
if state.payment_preimage.is_none() {
2994+
res.insert(source.clone(), htlc.clone());
2995+
}
2996+
}
2997+
}
2998+
}
2999+
};
3000+
}
3001+
3002+
let txid = confirmed_txid.unwrap();
3003+
if Some(txid) == us.funding.current_counterparty_commitment_txid
3004+
|| Some(txid) == us.funding.prev_counterparty_commitment_txid
3005+
{
3006+
walk_htlcs!(
3007+
false,
3008+
us.funding.counterparty_claimable_outpoints.get(&txid).unwrap().iter().filter_map(
3009+
|(a, b)| {
3010+
if let &Some(ref source) = b {
3011+
Some((a, Some(&**source)))
3012+
} else {
3013+
None
3014+
}
3015+
}
3016+
)
3017+
);
3018+
} else if txid == us.funding.current_holder_commitment_tx.trust().txid() {
3019+
walk_htlcs!(true, holder_commitment_htlcs!(us, CURRENT_WITH_SOURCES));
3020+
} else if let Some(prev_commitment_tx) = &us.funding.prev_holder_commitment_tx {
3021+
if txid == prev_commitment_tx.trust().txid() {
3022+
walk_htlcs!(true, holder_commitment_htlcs!(us, PREV_WITH_SOURCES).unwrap());
3023+
}
3024+
}
3025+
3026+
res
3027+
}
3028+
29603029
/// Gets the set of outbound HTLCs which are pending resolution in this channel or which were
29613030
/// resolved with a preimage from our counterparty.
29623031
///

lightning/src/ln/channelmanager.rs

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15842,7 +15842,7 @@ where
1584215842
log_error!(logger, " The ChannelMonitor for channel {} is at counterparty commitment transaction number {} but the ChannelManager is at counterparty commitment transaction number {}.",
1584315843
&channel.context.channel_id(), monitor.get_cur_counterparty_commitment_number(), channel.get_cur_counterparty_commitment_transaction_number());
1584415844
}
15845-
let mut shutdown_result =
15845+
let shutdown_result =
1584615846
channel.force_shutdown(ClosureReason::OutdatedChannelManager);
1584715847
if shutdown_result.unbroadcasted_batch_funding_txid.is_some() {
1584815848
return Err(DecodeError::InvalidValue);
@@ -15874,7 +15874,10 @@ where
1587415874
},
1587515875
);
1587615876
}
15877-
failed_htlcs.append(&mut shutdown_result.dropped_outbound_htlcs);
15877+
for (source, hash, cp_id, chan_id) in shutdown_result.dropped_outbound_htlcs {
15878+
let reason = LocalHTLCFailureReason::ChannelClosed;
15879+
failed_htlcs.push((source, hash, cp_id, chan_id, reason));
15880+
}
1587815881
channel_closures.push_back((
1587915882
events::Event::ChannelClosed {
1588015883
channel_id: channel.context.channel_id(),
@@ -15916,6 +15919,7 @@ where
1591615919
*payment_hash,
1591715920
channel.context.get_counterparty_node_id(),
1591815921
channel.context.channel_id(),
15922+
LocalHTLCFailureReason::ChannelClosed,
1591915923
));
1592015924
}
1592115925
}
@@ -16640,6 +16644,20 @@ where
1664016644
},
1664116645
}
1664216646
}
16647+
for (htlc_source, htlc) in monitor.get_onchain_failed_outbound_htlcs() {
16648+
log_info!(
16649+
args.logger,
16650+
"Failing HTLC with payment hash {} as it was resolved on-chain.",
16651+
htlc.payment_hash
16652+
);
16653+
failed_htlcs.push((
16654+
htlc_source,
16655+
htlc.payment_hash,
16656+
monitor.get_counterparty_node_id(),
16657+
monitor.channel_id(),
16658+
LocalHTLCFailureReason::OnChainTimeout,
16659+
));
16660+
}
1664316661
}
1664416662

1664516663
// Whether the downstream channel was closed or not, try to re-apply any payment
@@ -17320,13 +17338,10 @@ where
1732017338
}
1732117339
}
1732217340

17323-
for htlc_source in failed_htlcs.drain(..) {
17324-
let (source, payment_hash, counterparty_node_id, channel_id) = htlc_source;
17325-
let failure_reason = LocalHTLCFailureReason::ChannelClosed;
17326-
let receiver = HTLCHandlingFailureType::Forward {
17327-
node_id: Some(counterparty_node_id),
17328-
channel_id,
17329-
};
17341+
for htlc_source in failed_htlcs {
17342+
let (source, payment_hash, counterparty_id, channel_id, failure_reason) = htlc_source;
17343+
let receiver =
17344+
HTLCHandlingFailureType::Forward { node_id: Some(counterparty_id), channel_id };
1733017345
let reason = HTLCFailReason::from_failure_code(failure_reason);
1733117346
channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
1733217347
}

lightning/src/ln/monitor_tests.rs

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3455,3 +3455,180 @@ fn test_lost_preimage_monitor_events() {
34553455
do_test_lost_preimage_monitor_events(true);
34563456
do_test_lost_preimage_monitor_events(false);
34573457
}
3458+
3459+
fn do_test_lost_timeout_monitor_events(on_counterparty_tx: bool) {
3460+
// `MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fasion - if the
3461+
// `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets
3462+
// persisted (i.e. due to a block update) then the node crashes, prior to persisting the
3463+
// `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManger` will be
3464+
// lost. This isn't likely in a sync persist environment, but in an async one this could be an
3465+
// issue.
3466+
//
3467+
// Note that this is only an issue for closed channels - `MonitorEvent`s only inform the
3468+
// `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup
3469+
// or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes
3470+
// completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved
3471+
// on-chain post closure. Of the three, only the last is problematic to lose prior to a reload.
3472+
//
3473+
// Here we test that losing `MonitorEvent`s that contain HTLC resolution via timeouts does not
3474+
// cause us to lose a `PaymentFailed` event.
3475+
let mut cfg = test_default_channel_config();
3476+
cfg.manually_accept_inbound_channels = true;
3477+
cfg.channel_handshake_config.negotiate_anchors_zero_fee_htlc_tx = true;
3478+
let cfgs = [Some(cfg.clone()), Some(cfg.clone()), Some(cfg.clone())];
3479+
3480+
let chanmon_cfgs = create_chanmon_cfgs(3);
3481+
let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
3482+
let persister;
3483+
let new_chain_mon;
3484+
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &cfgs);
3485+
let node_b_reload;
3486+
let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
3487+
3488+
provide_anchor_reserves(&nodes);
3489+
3490+
let node_a_id = nodes[0].node.get_our_node_id();
3491+
let node_b_id = nodes[1].node.get_our_node_id();
3492+
let node_c_id = nodes[2].node.get_our_node_id();
3493+
3494+
let chan_a = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 0).2;
3495+
let chan_b = create_announced_chan_between_nodes_with_value(&nodes, 1, 2, 1_000_000, 0).2;
3496+
3497+
// Ensure all nodes are at the same height
3498+
let node_max_height =
3499+
nodes.iter().map(|node| node.blocks.lock().unwrap().len()).max().unwrap() as u32;
3500+
connect_blocks(&nodes[0], node_max_height - nodes[0].best_block_info().1);
3501+
connect_blocks(&nodes[1], node_max_height - nodes[1].best_block_info().1);
3502+
connect_blocks(&nodes[2], node_max_height - nodes[2].best_block_info().1);
3503+
3504+
let (_, hash_a, ..) = route_payment(&nodes[0], &[&nodes[1], &nodes[2]], 5_000_000);
3505+
let (_, hash_b, ..) = route_payment(&nodes[1], &[&nodes[2]], 5_000_000);
3506+
3507+
nodes[1].node.peer_disconnected(nodes[2].node.get_our_node_id());
3508+
nodes[2].node.peer_disconnected(nodes[1].node.get_our_node_id());
3509+
3510+
// Force-close the channel, confirming a commitment transaction then letting C claim the HTLCs.
3511+
let message = "Closed".to_owned();
3512+
nodes[2]
3513+
.node
3514+
.force_close_broadcasting_latest_txn(&chan_b, &node_b_id, message.clone())
3515+
.unwrap();
3516+
check_added_monitors(&nodes[2], 1);
3517+
let c_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3518+
check_closed_event!(nodes[2], 1, c_reason, [node_b_id], 1_000_000);
3519+
check_closed_broadcast!(nodes[2], true);
3520+
3521+
handle_bump_events(&nodes[2], true, 0);
3522+
let cs_commit_tx = nodes[2].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3523+
assert_eq!(cs_commit_tx.len(), 1);
3524+
3525+
let message = "Closed".to_owned();
3526+
nodes[1]
3527+
.node
3528+
.force_close_broadcasting_latest_txn(&chan_b, &node_c_id, message.clone())
3529+
.unwrap();
3530+
check_added_monitors(&nodes[1], 1);
3531+
let b_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3532+
check_closed_event!(nodes[1], 1, b_reason, [node_c_id], 1_000_000);
3533+
check_closed_broadcast!(nodes[1], true);
3534+
3535+
handle_bump_events(&nodes[1], true, 0);
3536+
let bs_commit_tx = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3537+
assert_eq!(bs_commit_tx.len(), 1);
3538+
3539+
let selected_commit_tx = if on_counterparty_tx {
3540+
&cs_commit_tx[0]
3541+
} else {
3542+
&bs_commit_tx[0]
3543+
};
3544+
3545+
mine_transaction(&nodes[1], selected_commit_tx);
3546+
// If the block gets connected first we may re-broadcast B's commitment transaction before
3547+
// seeing the C's confirm.
3548+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clear();
3549+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3550+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3551+
if on_counterparty_tx {
3552+
assert_eq!(events.len(), 1, "{events:?}");
3553+
match events[0] {
3554+
Event::SpendableOutputs { .. } => {},
3555+
_ => panic!("Unexpected event {events:?}"),
3556+
}
3557+
} else {
3558+
assert_eq!(events.len(), 0);
3559+
}
3560+
3561+
connect_blocks(&nodes[1], TEST_FINAL_CLTV - ANTI_REORG_DELAY + 1);
3562+
if !on_counterparty_tx {
3563+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3564+
assert_eq!(events.len(), 1, "{events:?}");
3565+
match events.pop().unwrap() {
3566+
Event::BumpTransaction(bump_event) => {
3567+
nodes[1].bump_tx_handler.handle_event(&bump_event);
3568+
},
3569+
_ => panic!("Unexpected event"),
3570+
}
3571+
}
3572+
let bs_htlc_timeouts =
3573+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3574+
assert_eq!(bs_htlc_timeouts.len(), 1);
3575+
3576+
// Now replay the timeouts on node B, which after 6 confirmations should fail the HTLCs via
3577+
// `MonitorUpdate`s
3578+
mine_transactions(&nodes[1], &bs_htlc_timeouts.iter().collect::<Vec<_>>());
3579+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3580+
3581+
// Now simulate a restart where the B<->C ChannelMonitor has been persisted (i.e. because we
3582+
// just processed a new block) but the ChannelManager was not. This should be exceedingly rare
3583+
// given we have to be connecting a block at the right moment and not manage to get a
3584+
// ChannelManager persisted after it does a thing that should immediately precede persistence,
3585+
// but with async persist it is more common.
3586+
//
3587+
// We do this by wiping the `MonitorEvent`s from the monitors and then reloading with the
3588+
// latest state.
3589+
let mon_events = nodes[1].chain_monitor.chain_monitor.release_pending_monitor_events();
3590+
assert_eq!(mon_events.len(), 1);
3591+
assert_eq!(mon_events[0].2.len(), 2);
3592+
3593+
let node_ser = nodes[1].node.encode();
3594+
let mon_a_ser = get_monitor!(nodes[1], chan_a).encode();
3595+
let mon_b_ser = get_monitor!(nodes[1], chan_b).encode();
3596+
let mons = &[&mon_a_ser[..], &mon_b_ser[..]];
3597+
reload_node!(nodes[1], cfg, &node_ser, mons, persister, new_chain_mon, node_b_reload);
3598+
3599+
let timeout_events = nodes[1].node.get_and_clear_pending_events();
3600+
assert_eq!(timeout_events.len(), 3, "{timeout_events:?}");
3601+
for ev in timeout_events {
3602+
match ev {
3603+
Event::PaymentPathFailed { payment_hash, .. } => {
3604+
assert_eq!(payment_hash, hash_b);
3605+
},
3606+
Event::PaymentFailed { payment_hash, .. } => {
3607+
assert_eq!(payment_hash, Some(hash_b));
3608+
},
3609+
Event::HTLCHandlingFailed { prev_channel_id, .. } => {
3610+
assert_eq!(prev_channel_id, chan_a);
3611+
},
3612+
_ => panic!("Wrong event {ev:?}"),
3613+
}
3614+
}
3615+
3616+
nodes[0].node.peer_disconnected(nodes[1].node.get_our_node_id());
3617+
3618+
let mut reconnect_args = ReconnectArgs::new(&nodes[0], &nodes[1]);
3619+
reconnect_args.pending_cell_htlc_fails = (0, 0);
3620+
reconnect_nodes(reconnect_args);
3621+
3622+
nodes[1].node.process_pending_htlc_forwards();
3623+
check_added_monitors(&nodes[1], 1);
3624+
let bs_fail = get_htlc_update_msgs(&nodes[1], &node_a_id);
3625+
nodes[0].node.handle_update_fail_htlc(node_b_id, &bs_fail.update_fail_htlcs[0]);
3626+
commitment_signed_dance!(nodes[0], nodes[1], bs_fail.commitment_signed, true, true);
3627+
expect_payment_failed!(nodes[0], hash_a, false);
3628+
}
3629+
3630+
#[test]
3631+
fn test_lost_timeout_monitor_events() {
3632+
do_test_lost_timeout_monitor_events(true);
3633+
do_test_lost_timeout_monitor_events(false);
3634+
}

0 commit comments

Comments
 (0)