Skip to content

Commit dc20515

Browse files
committed
Re-fail perm-failed HTLCs on startup in case of MonitorEvent loss
`MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets persisted (i.e. due to a block update) then the node crashes, prior to persisting the `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be lost. This isn't likely in a sync persist environment, but in an async one this could be an issue. Note that this is only an issue for closed channels - `MonitorEvent`s only inform the `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved on-chain post closure. Of the three, only the last is problematic to lose prior to a reload. In a previous commit we handled the case of claimed HTLCs by replaying payment preimages on startup to avoid `MonitorEvent` loss causing us to miss an HTLC claim. Here we handle the HTLC-failed case similarly. Unlike with HTLC claims via preimage, we don't already have replay logic in `ChannelManager` startup, but it's easy enough to add one. Luckily, we already track when an HTLC reaches permanently-failed state in `ChannelMonitor` (i.e. it has `ANTI_REORG_DELAY` confirmations on-chain on the failing transaction), so all we need to do is add the ability to query for that and fail them on `ChannelManager` startup.
1 parent db6d73c commit dc20515

File tree

3 files changed

+268
-9
lines changed

3 files changed

+268
-9
lines changed

lightning/src/chain/channelmonitor.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2957,6 +2957,75 @@ impl<Signer: EcdsaChannelSigner> ChannelMonitor<Signer> {
29572957
res
29582958
}
29592959

2960+
/// Gets the set of outbound HTLCs which hit the chain and ultimately were claimed by us via
2961+
/// the timeout path and reached [`ANTI_REORG_DELAY`] confirmations. This is used to determine
2962+
/// if an HTLC has failed without the `ChannelManager` having seen it prior to being persisted.
2963+
pub(crate) fn get_onchain_failed_outbound_htlcs(
2964+
&self,
2965+
) -> HashMap<HTLCSource, HTLCOutputInCommitment> {
2966+
let us = self.inner.lock().unwrap();
2967+
// We're only concerned with the confirmation count of HTLC transactions, and don't
2968+
// actually care how many confirmations a commitment transaction may or may not have. Thus,
2969+
// we look for either a FundingSpendConfirmation event or a funding_spend_confirmed.
2970+
let confirmed_txid = us.funding_spend_confirmed.or_else(|| {
2971+
us.onchain_events_awaiting_threshold_conf.iter().find_map(|event| {
2972+
if let OnchainEvent::FundingSpendConfirmation { .. } = event.event {
2973+
Some(event.txid)
2974+
} else {
2975+
None
2976+
}
2977+
})
2978+
});
2979+
2980+
if confirmed_txid.is_none() {
2981+
return new_hash_map();
2982+
}
2983+
2984+
let mut res = new_hash_map();
2985+
macro_rules! walk_htlcs {
2986+
($holder_commitment: expr, $htlc_iter: expr) => {
2987+
for (htlc, source) in $htlc_iter {
2988+
let filter = |v: &&IrrevocablyResolvedHTLC| {
2989+
v.commitment_tx_output_idx == htlc.transaction_output_index
2990+
};
2991+
if let Some(state) = us.htlcs_resolved_on_chain.iter().filter(filter).next() {
2992+
if let Some(source) = source {
2993+
if state.payment_preimage.is_none() {
2994+
res.insert(source.clone(), htlc.clone());
2995+
}
2996+
}
2997+
}
2998+
}
2999+
};
3000+
}
3001+
3002+
let txid = confirmed_txid.unwrap();
3003+
if Some(txid) == us.funding.current_counterparty_commitment_txid
3004+
|| Some(txid) == us.funding.prev_counterparty_commitment_txid
3005+
{
3006+
walk_htlcs!(
3007+
false,
3008+
us.funding.counterparty_claimable_outpoints.get(&txid).unwrap().iter().filter_map(
3009+
|(a, b)| {
3010+
if let &Some(ref source) = b {
3011+
Some((a, Some(&**source)))
3012+
} else {
3013+
None
3014+
}
3015+
}
3016+
)
3017+
);
3018+
} else if txid == us.funding.current_holder_commitment_tx.trust().txid() {
3019+
walk_htlcs!(true, holder_commitment_htlcs!(us, CURRENT_WITH_SOURCES));
3020+
} else if let Some(prev_commitment_tx) = &us.funding.prev_holder_commitment_tx {
3021+
if txid == prev_commitment_tx.trust().txid() {
3022+
walk_htlcs!(true, holder_commitment_htlcs!(us, PREV_WITH_SOURCES).unwrap());
3023+
}
3024+
}
3025+
3026+
res
3027+
}
3028+
29603029
/// Gets the set of outbound HTLCs which are pending resolution in this channel or which were
29613030
/// resolved with a preimage from our counterparty.
29623031
///

lightning/src/ln/channelmanager.rs

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15842,7 +15842,7 @@ where
1584215842
log_error!(logger, " The ChannelMonitor for channel {} is at counterparty commitment transaction number {} but the ChannelManager is at counterparty commitment transaction number {}.",
1584315843
&channel.context.channel_id(), monitor.get_cur_counterparty_commitment_number(), channel.get_cur_counterparty_commitment_transaction_number());
1584415844
}
15845-
let mut shutdown_result =
15845+
let shutdown_result =
1584615846
channel.force_shutdown(ClosureReason::OutdatedChannelManager);
1584715847
if shutdown_result.unbroadcasted_batch_funding_txid.is_some() {
1584815848
return Err(DecodeError::InvalidValue);
@@ -15874,7 +15874,10 @@ where
1587415874
},
1587515875
);
1587615876
}
15877-
failed_htlcs.append(&mut shutdown_result.dropped_outbound_htlcs);
15877+
for (source, hash, cp_id, chan_id) in shutdown_result.dropped_outbound_htlcs {
15878+
let reason = LocalHTLCFailureReason::ChannelClosed;
15879+
failed_htlcs.push((source, hash, cp_id, chan_id, reason));
15880+
}
1587815881
channel_closures.push_back((
1587915882
events::Event::ChannelClosed {
1588015883
channel_id: channel.context.channel_id(),
@@ -15916,6 +15919,7 @@ where
1591615919
*payment_hash,
1591715920
channel.context.get_counterparty_node_id(),
1591815921
channel.context.channel_id(),
15922+
LocalHTLCFailureReason::ChannelClosed,
1591915923
));
1592015924
}
1592115925
}
@@ -16640,6 +16644,20 @@ where
1664016644
},
1664116645
}
1664216646
}
16647+
for (htlc_source, htlc) in monitor.get_onchain_failed_outbound_htlcs() {
16648+
log_info!(
16649+
args.logger,
16650+
"Failing HTLC with payment hash {} as it was resolved on-chain.",
16651+
htlc.payment_hash
16652+
);
16653+
failed_htlcs.push((
16654+
htlc_source,
16655+
htlc.payment_hash,
16656+
monitor.get_counterparty_node_id(),
16657+
monitor.channel_id(),
16658+
LocalHTLCFailureReason::OnChainTimeout,
16659+
));
16660+
}
1664316661
}
1664416662

1664516663
// Whether the downstream channel was closed or not, try to re-apply any payment
@@ -17320,13 +17338,10 @@ where
1732017338
}
1732117339
}
1732217340

17323-
for htlc_source in failed_htlcs.drain(..) {
17324-
let (source, payment_hash, counterparty_node_id, channel_id) = htlc_source;
17325-
let failure_reason = LocalHTLCFailureReason::ChannelClosed;
17326-
let receiver = HTLCHandlingFailureType::Forward {
17327-
node_id: Some(counterparty_node_id),
17328-
channel_id,
17329-
};
17341+
for htlc_source in failed_htlcs {
17342+
let (source, payment_hash, counterparty_id, channel_id, failure_reason) = htlc_source;
17343+
let receiver =
17344+
HTLCHandlingFailureType::Forward { node_id: Some(counterparty_id), channel_id };
1733017345
let reason = HTLCFailReason::from_failure_code(failure_reason);
1733117346
channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
1733217347
}

lightning/src/ln/monitor_tests.rs

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3453,3 +3453,178 @@ fn test_lost_preimage_monitor_events() {
34533453
do_test_lost_preimage_monitor_events(true);
34543454
do_test_lost_preimage_monitor_events(false);
34553455
}
3456+
3457+
fn do_test_lost_timeout_monitor_events(on_counterparty_tx: bool) {
3458+
// `MonitorEvent`s aren't delivered to the `ChannelManager` in a durable fashion - if the
3459+
// `ChannelManager` fetches the pending `MonitorEvent`s, then the `ChannelMonitor` gets
3460+
// persisted (i.e. due to a block update) then the node crashes, prior to persisting the
3461+
// `ChannelManager` again, the `MonitorEvent` and its effects on the `ChannelManager` will be
3462+
// lost. This isn't likely in a sync persist environment, but in an async one this could be an
3463+
// issue.
3464+
//
3465+
// Note that this is only an issue for closed channels - `MonitorEvent`s only inform the
3466+
// `ChannelManager` that a channel is closed (which the `ChannelManager` will learn on startup
3467+
// or when it next tries to advance the channel state), that `ChannelMonitorUpdate` writes
3468+
// completed (which the `ChannelManager` will detect on startup), or that HTLCs resolved
3469+
// on-chain post closure. Of the three, only the last is problematic to lose prior to a reload.
3470+
//
3471+
// Here we test that losing `MonitorEvent`s that contain HTLC resolution via timeouts does not
3472+
// cause us to lose a `PaymentFailed` event.
3473+
let mut cfg = test_default_channel_config();
3474+
cfg.manually_accept_inbound_channels = true;
3475+
cfg.channel_handshake_config.negotiate_anchors_zero_fee_htlc_tx = true;
3476+
let cfgs = [Some(cfg.clone()), Some(cfg.clone()), Some(cfg.clone())];
3477+
3478+
let chanmon_cfgs = create_chanmon_cfgs(3);
3479+
let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
3480+
let persister;
3481+
let new_chain_mon;
3482+
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &cfgs);
3483+
let node_b_reload;
3484+
let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
3485+
3486+
provide_anchor_reserves(&nodes);
3487+
3488+
let node_a_id = nodes[0].node.get_our_node_id();
3489+
let node_b_id = nodes[1].node.get_our_node_id();
3490+
let node_c_id = nodes[2].node.get_our_node_id();
3491+
3492+
let chan_a = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 0).2;
3493+
let chan_b = create_announced_chan_between_nodes_with_value(&nodes, 1, 2, 1_000_000, 0).2;
3494+
3495+
// Ensure all nodes are at the same height
3496+
let node_max_height =
3497+
nodes.iter().map(|node| node.blocks.lock().unwrap().len()).max().unwrap() as u32;
3498+
connect_blocks(&nodes[0], node_max_height - nodes[0].best_block_info().1);
3499+
connect_blocks(&nodes[1], node_max_height - nodes[1].best_block_info().1);
3500+
connect_blocks(&nodes[2], node_max_height - nodes[2].best_block_info().1);
3501+
3502+
let (_, hash_a, ..) = route_payment(&nodes[0], &[&nodes[1], &nodes[2]], 5_000_000);
3503+
let (_, hash_b, ..) = route_payment(&nodes[1], &[&nodes[2]], 5_000_000);
3504+
3505+
nodes[1].node.peer_disconnected(nodes[2].node.get_our_node_id());
3506+
nodes[2].node.peer_disconnected(nodes[1].node.get_our_node_id());
3507+
3508+
// Force-close the channel, confirming a commitment transaction then letting C claim the HTLCs.
3509+
let message = "Closed".to_owned();
3510+
nodes[2]
3511+
.node
3512+
.force_close_broadcasting_latest_txn(&chan_b, &node_b_id, message.clone())
3513+
.unwrap();
3514+
check_added_monitors(&nodes[2], 1);
3515+
let c_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3516+
check_closed_event!(nodes[2], 1, c_reason, [node_b_id], 1_000_000);
3517+
check_closed_broadcast!(nodes[2], true);
3518+
3519+
let cs_commit_tx = nodes[2].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3520+
assert_eq!(cs_commit_tx.len(), 1);
3521+
3522+
let message = "Closed".to_owned();
3523+
nodes[1]
3524+
.node
3525+
.force_close_broadcasting_latest_txn(&chan_b, &node_c_id, message.clone())
3526+
.unwrap();
3527+
check_added_monitors(&nodes[1], 1);
3528+
let b_reason = ClosureReason::HolderForceClosed { broadcasted_latest_txn: Some(true), message };
3529+
check_closed_event!(nodes[1], 1, b_reason, [node_c_id], 1_000_000);
3530+
check_closed_broadcast!(nodes[1], true);
3531+
3532+
let bs_commit_tx = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3533+
assert_eq!(bs_commit_tx.len(), 1);
3534+
3535+
let selected_commit_tx = if on_counterparty_tx {
3536+
&cs_commit_tx[0]
3537+
} else {
3538+
&bs_commit_tx[0]
3539+
};
3540+
3541+
mine_transaction(&nodes[1], selected_commit_tx);
3542+
// If the block gets connected first we may re-broadcast B's commitment transaction before
3543+
// seeing C's confirm.
3544+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clear();
3545+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3546+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3547+
if on_counterparty_tx {
3548+
assert_eq!(events.len(), 1, "{events:?}");
3549+
match events[0] {
3550+
Event::SpendableOutputs { .. } => {},
3551+
_ => panic!("Unexpected event {events:?}"),
3552+
}
3553+
} else {
3554+
assert_eq!(events.len(), 0);
3555+
}
3556+
3557+
connect_blocks(&nodes[1], TEST_FINAL_CLTV - ANTI_REORG_DELAY + 1);
3558+
if !on_counterparty_tx {
3559+
let mut events = nodes[1].chain_monitor.chain_monitor.get_and_clear_pending_events();
3560+
assert_eq!(events.len(), 1, "{events:?}");
3561+
match events.pop().unwrap() {
3562+
Event::BumpTransaction(bump_event) => {
3563+
nodes[1].bump_tx_handler.handle_event(&bump_event);
3564+
},
3565+
_ => panic!("Unexpected event"),
3566+
}
3567+
}
3568+
let bs_htlc_timeouts =
3569+
nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0);
3570+
assert_eq!(bs_htlc_timeouts.len(), 1);
3571+
3572+
// Now replay the timeouts on node B, which after 6 confirmations should fail the HTLCs via
3573+
// `MonitorUpdate`s
3574+
mine_transactions(&nodes[1], &bs_htlc_timeouts.iter().collect::<Vec<_>>());
3575+
connect_blocks(&nodes[1], ANTI_REORG_DELAY - 1);
3576+
3577+
// Now simulate a restart where the B<->C ChannelMonitor has been persisted (i.e. because we
3578+
// just processed a new block) but the ChannelManager was not. This should be exceedingly rare
3579+
// given we have to be connecting a block at the right moment and not manage to get a
3580+
// ChannelManager persisted after it does a thing that should immediately precede persistence,
3581+
// but with async persist it is more common.
3582+
//
3583+
// We do this by wiping the `MonitorEvent`s from the monitors and then reloading with the
3584+
// latest state.
3585+
let mon_events = nodes[1].chain_monitor.chain_monitor.release_pending_monitor_events();
3586+
assert_eq!(mon_events.len(), 1);
3587+
assert_eq!(mon_events[0].2.len(), 2);
3588+
3589+
let node_ser = nodes[1].node.encode();
3590+
let mon_a_ser = get_monitor!(nodes[1], chan_a).encode();
3591+
let mon_b_ser = get_monitor!(nodes[1], chan_b).encode();
3592+
let mons = &[&mon_a_ser[..], &mon_b_ser[..]];
3593+
reload_node!(nodes[1], cfg, &node_ser, mons, persister, new_chain_mon, node_b_reload);
3594+
3595+
let timeout_events = nodes[1].node.get_and_clear_pending_events();
3596+
assert_eq!(timeout_events.len(), 3, "{timeout_events:?}");
3597+
for ev in timeout_events {
3598+
match ev {
3599+
Event::PaymentPathFailed { payment_hash, .. } => {
3600+
assert_eq!(payment_hash, hash_b);
3601+
},
3602+
Event::PaymentFailed { payment_hash, .. } => {
3603+
assert_eq!(payment_hash, Some(hash_b));
3604+
},
3605+
Event::HTLCHandlingFailed { prev_channel_id, .. } => {
3606+
assert_eq!(prev_channel_id, chan_a);
3607+
},
3608+
_ => panic!("Wrong event {ev:?}"),
3609+
}
3610+
}
3611+
3612+
nodes[0].node.peer_disconnected(nodes[1].node.get_our_node_id());
3613+
3614+
let mut reconnect_args = ReconnectArgs::new(&nodes[0], &nodes[1]);
3615+
reconnect_args.pending_cell_htlc_fails = (0, 0);
3616+
reconnect_nodes(reconnect_args);
3617+
3618+
nodes[1].node.process_pending_htlc_forwards();
3619+
check_added_monitors(&nodes[1], 1);
3620+
let bs_fail = get_htlc_update_msgs(&nodes[1], &node_a_id);
3621+
nodes[0].node.handle_update_fail_htlc(node_b_id, &bs_fail.update_fail_htlcs[0]);
3622+
commitment_signed_dance!(nodes[0], nodes[1], bs_fail.commitment_signed, true, true);
3623+
expect_payment_failed!(nodes[0], hash_a, false);
3624+
}
3625+
3626+
#[test]
3627+
fn test_lost_timeout_monitor_events() {
3628+
do_test_lost_timeout_monitor_events(true);
3629+
do_test_lost_timeout_monitor_events(false);
3630+
}

0 commit comments

Comments
 (0)