@@ -47,6 +47,9 @@ use lambdaworks_crypto::merkle_tree::traits::IsMerkleTreeBackend;
4747use log:: { debug, error, info, warn} ;
4848use tokio:: net:: { TcpListener , TcpStream } ;
4949use tokio:: sync:: { Mutex , MutexGuard , RwLock } ;
50+
51+ // Message handler lock timeout
52+ const MESSAGE_HANDLER_LOCK_TIMEOUT : Duration = Duration :: from_secs ( 10 ) ;
5053use tokio_tungstenite:: tungstenite:: { Error , Message } ;
5154use types:: batch_queue:: { self , BatchQueueEntry , BatchQueueEntryPriority } ;
5255use types:: errors:: { BatcherError , TransactionSendError } ;
@@ -112,11 +115,6 @@ pub struct Batcher {
112115 // Because of this exception, user message handling uses lock acquisition with timeouts.
113116 batch_state : Mutex < BatchState > ,
114117
115- /// Flag to indicate when recovery is in progress
116- /// When true, message handlers will return ServerBusy responses
117- /// It's used a way to "lock" all the user_states at the same time
118- /// If one needed is taken in the handle message it will time out
119- is_recovering_from_submission_failure : RwLock < bool > ,
120118 user_states : Arc < RwLock < HashMap < Address , Arc < Mutex < UserState > > > > > ,
121119
122120 last_uploaded_batch_block : Mutex < u64 > ,
@@ -335,7 +333,6 @@ impl Batcher {
335333 batch_state : Mutex :: new ( batch_state) ,
336334 user_states,
337335 disabled_verifiers : Mutex :: new ( disabled_verifiers) ,
338- is_recovering_from_submission_failure : RwLock :: new ( false ) ,
339336 metrics,
340337 telemetry,
341338 }
@@ -423,7 +420,7 @@ impl Batcher {
423420 where
424421 F : std:: future:: Future < Output = T > ,
425422 {
426- match timeout ( Duration :: from_secs ( 15 ) , lock_future) . await {
423+ match timeout ( MESSAGE_HANDLER_LOCK_TIMEOUT , lock_future) . await {
427424 Ok ( result) => Some ( result) ,
428425 Err ( _) => {
429426 warn ! ( "User lock acquisition timed out for address {}" , addr) ;
@@ -438,7 +435,7 @@ impl Batcher {
438435 where
439436 F : std:: future:: Future < Output = T > ,
440437 {
441- match timeout ( Duration :: from_secs ( 15 ) , lock_future) . await {
438+ match timeout ( MESSAGE_HANDLER_LOCK_TIMEOUT , lock_future) . await {
442439 Ok ( result) => Some ( result) ,
443440 Err ( _) => {
444441 warn ! ( "Batch lock acquisition timed out" ) ;
@@ -709,17 +706,6 @@ impl Batcher {
709706 mut address : Address ,
710707 ws_conn_sink : WsMessageSink ,
711708 ) -> Result < ( ) , Error > {
712- // Check if restoration is in progress
713- if * self . is_recovering_from_submission_failure . read ( ) . await {
714- warn ! (
715- "Rejecting nonce request from {} during restoration" ,
716- address
717- ) ;
718- let response = GetNonceResponseMessage :: ServerBusy ;
719- send_message ( ws_conn_sink, response) . await ;
720- return Ok ( ( ) ) ;
721- }
722-
723709 // If the address is not paying, we will return the nonce of the aligned_payment_address
724710 if !self . has_to_pay ( & address) {
725711 info ! ( "Handling nonpaying message" ) ;
@@ -741,7 +727,21 @@ impl Batcher {
741727 }
742728
743729 let cached_user_nonce = {
744- let user_state_ref = self . user_states . read ( ) . await . get ( & address) . cloned ( ) ;
730+ let user_states_guard = match timeout (
731+ MESSAGE_HANDLER_LOCK_TIMEOUT ,
732+ self . user_states . read ( ) ,
733+ )
734+ . await
735+ {
736+ Ok ( guard) => guard,
737+ Err ( _) => {
738+ warn ! ( "User states read lock acquisition timed out in handle_get_nonce_for_address_msg" ) ;
739+ self . metrics . inc_message_handler_user_states_lock_timeouts ( ) ;
740+ send_message ( ws_conn_sink, GetNonceResponseMessage :: ServerBusy ) . await ;
741+ return Ok ( ( ) ) ;
742+ }
743+ } ;
744+ let user_state_ref = user_states_guard. get ( & address) . cloned ( ) ;
745745 match user_state_ref {
746746 Some ( user_state_ref) => {
747747 let Some ( user_state_guard) = self
@@ -804,21 +804,6 @@ impl Batcher {
804804 debug ! ( "Received message with nonce: {msg_nonce:?}" ) ;
805805 self . metrics . received_proofs . inc ( ) ;
806806
807- // Check if restoration is in progress
808- if * self . is_recovering_from_submission_failure . read ( ) . await {
809- warn ! (
810- "Rejecting proof submission from {} during restoration (nonce: {})" ,
811- client_msg
812- . verification_data
813- . verification_data
814- . proof_generator_addr,
815- msg_nonce
816- ) ;
817- let response = SubmitProofResponseMessage :: ServerBusy ;
818- send_message ( ws_conn_sink, response) . await ;
819- return Ok ( ( ) ) ;
820- }
821-
822807 // * ---------------------------------------------------*
823808 // * Perform validations over the message *
824809 // * ---------------------------------------------------*
@@ -881,7 +866,17 @@ impl Batcher {
881866 // If it was not present, then the user nonce is queried to the Aligned contract.
882867 // Lastly, we get a lock of the batch state again and insert the user state if it was still missing.
883868
884- let is_user_in_state = self . user_states . read ( ) . await . contains_key ( & addr) ;
869+ let is_user_in_state = match timeout ( MESSAGE_HANDLER_LOCK_TIMEOUT , self . user_states . read ( ) )
870+ . await
871+ {
872+ Ok ( user_states_guard) => user_states_guard. contains_key ( & addr) ,
873+ Err ( _) => {
874+ warn ! ( "User states read lock acquisition timed out in handle_submit_proof_msg (user check)" ) ;
875+ self . metrics . inc_message_handler_user_states_lock_timeouts ( ) ;
876+ send_message ( ws_conn_sink, SubmitProofResponseMessage :: ServerBusy ) . await ;
877+ return Ok ( ( ) ) ;
878+ }
879+ } ;
885880
886881 if !is_user_in_state {
887882 // If the user state was not present, we need to get the nonce from the Ethereum contract
@@ -903,14 +898,32 @@ impl Batcher {
903898 debug ! ( "User state for address {addr:?} not found, creating a new one" ) ;
904899 // We add a dummy user state to grab a lock on the user state
905900 let dummy_user_state = UserState :: new ( ethereum_user_nonce) ;
906- self . user_states
907- . write ( )
908- . await
909- . insert ( addr, Arc :: new ( Mutex :: new ( dummy_user_state) ) ) ;
901+ match timeout ( MESSAGE_HANDLER_LOCK_TIMEOUT , self . user_states . write ( ) ) . await {
902+ Ok ( mut user_states_guard) => {
903+ user_states_guard. insert ( addr, Arc :: new ( Mutex :: new ( dummy_user_state) ) ) ;
904+ }
905+ Err ( _) => {
906+ warn ! ( "User states write lock acquisition timed out in handle_submit_proof_msg (user creation)" ) ;
907+ self . metrics . inc_message_handler_user_states_lock_timeouts ( ) ;
908+ send_message ( ws_conn_sink, SubmitProofResponseMessage :: ServerBusy ) . await ;
909+ return Ok ( ( ) ) ;
910+ }
911+ } ;
910912 debug ! ( "Dummy user state for address {addr:?} created" ) ;
911913 }
912914
913- let Some ( user_state_ref) = self . user_states . read ( ) . await . get ( & addr) . cloned ( ) else {
915+ let user_state_ref = match timeout ( MESSAGE_HANDLER_LOCK_TIMEOUT , self . user_states . read ( ) )
916+ . await
917+ {
918+ Ok ( user_states_guard) => user_states_guard. get ( & addr) . cloned ( ) ,
919+ Err ( _) => {
920+ warn ! ( "User states read lock acquisition timed out in handle_submit_proof_msg (user retrieval)" ) ;
921+ self . metrics . inc_message_handler_user_states_lock_timeouts ( ) ;
922+ send_message ( ws_conn_sink, SubmitProofResponseMessage :: ServerBusy ) . await ;
923+ return Ok ( ( ) ) ;
924+ }
925+ } ;
926+ let Some ( user_state_ref) = user_state_ref else {
914927 error ! ( "This should never happen, user state has previously been inserted if it didn't exist" ) ;
915928 send_message (
916929 ws_conn_sink. clone ( ) ,
@@ -1621,9 +1634,7 @@ impl Batcher {
16211634 failed_batch. len( )
16221635 ) ;
16231636
1624- // Set restoration flag to stop handling new user messages
1625- * self . is_recovering_from_submission_failure . write ( ) . await = true ;
1626-
1637+ let user_states_lock = self . user_states . write ( ) . await ;
16271638 let mut batch_state_lock = self . batch_state . lock ( ) . await ;
16281639 let mut restored_entries = Vec :: new ( ) ;
16291640
@@ -1689,8 +1700,8 @@ impl Batcher {
16891700 // Only auxiliary user data (max_min_fee) can be "inconsistent"
16901701 // but we can keep updating it without locking the queue
16911702 info ! ( "Queue recovered from submission failure, resuming user processing and updating user states metadata" ) ;
1703+ std:: mem:: drop ( user_states_lock) ;
16921704 std:: mem:: drop ( batch_state_lock) ;
1693- * self . is_recovering_from_submission_failure . write ( ) . await = false ;
16941705
16951706 info ! ( "Updating user states after proof restoration..." ) ;
16961707 if let Err ( e) = self
0 commit comments