@@ -20,6 +20,7 @@ use crate::message_queue::MessageQueue;
 use crate::persist::{
 	LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE, LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
 };
+use crate::prelude::hash_map::Entry;
 use crate::prelude::*;
 use crate::sync::{Arc, Mutex, RwLock, RwLockWriteGuard};
 use crate::utils::time::TimeProvider;
@@ -35,6 +36,7 @@ use lightning::util::persist::KVStore;
 use lightning::util::ser::Writeable;
 
 use core::ops::Deref;
+use core::sync::atomic::{AtomicUsize, Ordering};
 use core::time::Duration;
 
 use alloc::string::String;
@@ -139,6 +141,7 @@ where
 	node_signer: NS,
 	kv_store: K,
 	last_pruning: Mutex<Option<LSPSDateTime>>,
+	persistence_in_flight: AtomicUsize,
 }
 
 impl<CM: Deref, NS: Deref, K: Deref + Clone, TP: Deref> LSPS5ServiceHandler<CM, NS, K, TP>
@@ -166,6 +169,7 @@ where
 			node_signer,
 			kv_store,
 			last_pruning: Mutex::new(None),
+			persistence_in_flight: AtomicUsize::new(0),
 		}
 	}
 
@@ -220,6 +224,8 @@ where
 
 		let key = counterparty_node_id.to_string();
 
+		// Begin the write with the `per_peer_state` write lock held to avoid racing with
+		// potentially-in-flight `persist` calls writing state for the same peer.
 		self.kv_store.write(
 			LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
 			LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
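A minimal standalone sketch of the ordering idea behind the new comment, using a hypothetical `Store` trait and `Handler` type rather than the LDK `KVStore` API, and assuming a backend that applies writes in the order its `write` method is called: the write is started while the peer-state lock is held, so calls for the same key cannot interleave with concurrent persist calls, but the returned future is awaited only after the guard is dropped, so the lock is never held across an `.await`.

```rust
use std::collections::HashMap;
use std::future::Future;
use std::pin::Pin;
use std::sync::{Arc, Mutex};

// Hypothetical stand-in for an async key-value backend; not the LDK `KVStore` trait.
trait Store: Send + Sync {
	fn write(
		&self, key: String, value: Vec<u8>,
	) -> Pin<Box<dyn Future<Output = Result<(), ()>> + Send>>;
}

struct PeerState {
	needs_persist: bool,
}

struct Handler<S: Store> {
	per_peer_state: Mutex<HashMap<String, PeerState>>,
	store: Arc<S>,
}

impl<S: Store> Handler<S> {
	async fn persist_peer_state(&self, key: String, value: Vec<u8>) -> Result<(), ()> {
		let fut = {
			// Begin the write while the peer-state lock is held, so the backend sees
			// writes for this key in the same order as competing persist calls.
			let mut peers = self.per_peer_state.lock().unwrap();
			if let Some(state) = peers.get_mut(&key) {
				state.needs_persist = false;
			}
			self.store.write(key, value)
		};
		// Await the I/O only after the guard has been dropped, so the lock is never
		// held across an `.await` point.
		fut.await
	}
}
```

The same begin-under-lock, await-outside shape shows up again in the removal path of the next hunk.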
@@ -242,38 +248,80 @@ where
 		// TODO: We should eventually persist in parallel, however, when we do, we probably want to
 		// introduce some batching to upper-bound the number of requests inflight at any given
 		// time.
-		let mut need_remove = Vec::new();
-		let mut need_persist = Vec::new();
-		{
-			let mut outer_state_lock = self.per_peer_state.write().unwrap();
-			self.check_prune_stale_webhooks(&mut outer_state_lock);
-
-			outer_state_lock.retain(|client_id, peer_state| {
-				let is_prunable = peer_state.is_prunable();
-				let has_open_channel = self.client_has_open_channel(client_id);
-				if is_prunable && !has_open_channel {
-					need_remove.push(*client_id);
-				} else if peer_state.needs_persist {
-					need_persist.push(*client_id);
-				}
-				!is_prunable || has_open_channel
-			});
-		};
 
-		for counterparty_node_id in need_persist.into_iter() {
-			debug_assert!(!need_remove.contains(&counterparty_node_id));
-			self.persist_peer_state(counterparty_node_id).await?;
+		if self.persistence_in_flight.fetch_add(1, Ordering::AcqRel) > 0 {
+			// If we're not the first event processor to get here, just return early, the increment
+			// we just did will be treated as "go around again" at the end.
+			return Ok(());
 		}
 
-		for counterparty_node_id in need_remove {
-			let key = counterparty_node_id.to_string();
-			self.kv_store
-				.remove(
-					LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-					LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
-					&key,
-				)
-				.await?;
+		loop {
+			let mut need_remove = Vec::new();
+			let mut need_persist = Vec::new();
+
+			self.check_prune_stale_webhooks(&mut self.per_peer_state.write().unwrap());
+			{
+				let outer_state_lock = self.per_peer_state.read().unwrap();
+
+				for (client_id, peer_state) in outer_state_lock.iter() {
+					let is_prunable = peer_state.is_prunable();
+					let has_open_channel = self.client_has_open_channel(client_id);
+					if is_prunable && !has_open_channel {
+						need_remove.push(*client_id);
+					} else if peer_state.needs_persist {
+						need_persist.push(*client_id);
+					}
+				}
+			}
+
+			for client_id in need_persist.into_iter() {
+				debug_assert!(!need_remove.contains(&client_id));
+				self.persist_peer_state(client_id).await?;
+			}
+
+			for client_id in need_remove {
+				let mut future_opt = None;
+				{
+					// We need to take the `per_peer_state` write lock to remove an entry, but also
+					// have to hold it until after the `remove` call returns (but not through
+					// future completion) to ensure that writes for the peer's state are
+					// well-ordered with other `persist_peer_state` calls even across the removal
+					// itself.
+					let mut per_peer_state = self.per_peer_state.write().unwrap();
+					if let Entry::Occupied(mut entry) = per_peer_state.entry(client_id) {
+						let state = entry.get_mut();
+						if state.is_prunable() && !self.client_has_open_channel(&client_id) {
+							entry.remove();
+							let key = client_id.to_string();
+							future_opt = Some(self.kv_store.remove(
+								LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+								LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
+								&key,
+							));
+						} else {
+							// If the peer was re-added, force a re-persist of the current state.
+							state.needs_persist = true;
+						}
+					} else {
+						// This should never happen, we can only have one `persist` call
+						// in-progress at once and map entries are only removed by it.
+						debug_assert!(false);
+					}
+				}
+				if let Some(future) = future_opt {
+					future.await?;
+				} else {
+					self.persist_peer_state(client_id).await?;
+				}
+			}
+
+			if self.persistence_in_flight.fetch_sub(1, Ordering::AcqRel) != 1 {
+				// If another thread incremented the state while we were running we should go
+				// around again, but only once.
+				self.persistence_in_flight.store(1, Ordering::Release);
+				continue;
+			}
+			break;
 		}
 
 		Ok(())
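A simplified, self-contained sketch of the two mechanisms this rewritten loop relies on, using hypothetical `Service`/`PeerState` types and placeholder persistence helpers rather than the LDK types: the `persistence_in_flight` counter collapses concurrent `persist` calls into at most one extra pass, and each removal is re-checked under the write lock in case the peer was re-added while earlier awaits were in flight.

```rust
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::RwLock;

#[derive(Default)]
struct PeerState {
	needs_persist: bool,
	webhooks: Vec<String>,
}

impl PeerState {
	fn is_prunable(&self) -> bool {
		self.webhooks.is_empty()
	}
}

#[derive(Default)]
struct Service {
	per_peer_state: RwLock<HashMap<u64, PeerState>>,
	persistence_in_flight: AtomicUsize,
}

impl Service {
	async fn persist_peer_state(&self, _id: u64) -> Result<(), ()> {
		Ok(()) // placeholder for the real per-peer store write
	}

	async fn remove_peer_state(&self, _id: u64) -> Result<(), ()> {
		Ok(()) // placeholder for the real store removal
	}

	async fn persist(&self) -> Result<(), ()> {
		if self.persistence_in_flight.fetch_add(1, Ordering::AcqRel) > 0 {
			// Another caller is already persisting; our increment tells it to
			// run one more pass before it finishes.
			return Ok(());
		}

		loop {
			// Collect the work under a short-lived read lock.
			let (need_persist, need_remove) = {
				let peers = self.per_peer_state.read().unwrap();
				let mut persist = Vec::new();
				let mut remove = Vec::new();
				for (id, state) in peers.iter() {
					if state.is_prunable() {
						remove.push(*id);
					} else if state.needs_persist {
						persist.push(*id);
					}
				}
				(persist, remove)
			};

			for id in need_persist {
				self.persist_peer_state(id).await?;
			}

			for id in need_remove {
				// Re-check under the write lock: the peer may have registered a
				// new webhook while we were awaiting other persistence calls.
				let removed = {
					let mut peers = self.per_peer_state.write().unwrap();
					match peers.entry(id) {
						Entry::Occupied(entry) if entry.get().is_prunable() => {
							entry.remove();
							true
						},
						Entry::Occupied(mut entry) => {
							// Re-added in the meantime: persist instead of removing.
							entry.get_mut().needs_persist = true;
							false
						},
						Entry::Vacant(_) => false,
					}
				};
				if removed {
					self.remove_peer_state(id).await?;
				} else {
					self.persist_peer_state(id).await?;
				}
			}

			if self.persistence_in_flight.fetch_sub(1, Ordering::AcqRel) != 1 {
				// Someone called `persist` while we were working: reset the
				// counter to 1 (just us) and make exactly one more pass.
				self.persistence_in_flight.store(1, Ordering::Release);
				continue;
			}
			break;
		}
		Ok(())
	}
}
```

The `fetch_add`/`fetch_sub` pair means only one task performs persistence I/O at a time while no concurrent request is lost: if the counter moved while the running task was working, it resets it to 1 and goes around exactly once more.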
@@ -761,7 +809,7 @@ impl PeerState {
 		});
 	}
 
-	fn is_prunable(&mut self) -> bool {
+	fn is_prunable(&self) -> bool {
 		self.webhooks.is_empty()
 	}
 }