@@ -571,6 +571,15 @@ impl Primary {
571571 "Checked for ready Cars after batch sync"
572572 ) ;
573573 for car in ready_cars {
574+ // Skip if this is our own Car (defensive check)
575+ if car. proposer == self . config . validator_id {
576+ trace ! (
577+ position = car. position,
578+ "Skipping attestation for our own Car in batch sync"
579+ ) ;
580+ continue ;
581+ }
582+
574583 // IMPORTANT: The Car was already validated when first received
575584 // (position check, signature, parent_ref all passed). We queued
576585 // it only because batches were missing. Now that batches are
@@ -624,6 +633,11 @@ impl Primary {
624633 "Received consensus decision notification"
625634 ) ;
626635
636+ // CRITICAL: Update last_cut to the DECIDED cut (not what we proposed)
637+ // This ensures monotonicity checks in form_cut use the actual consensus
638+ // state, preventing stale proposed cuts from causing violations
639+ self . last_cut = Some ( cut. clone ( ) ) ;
640+
627641 // CRITICAL: Sync position tracking from the decided Cut BEFORE advancing state
628642 // This ensures validators that missed some CARs during collection still have
629643 // consistent position tracking for subsequent heights
@@ -858,6 +872,15 @@ impl Primary {
858872
859873 /// Handle a received Car
860874 async fn handle_received_car ( & mut self , from : ValidatorId , car : Car ) {
875+ // Skip if this is our own Car (we handle our own Cars through attestation collection)
876+ if car. proposer == self . config . validator_id {
877+ trace ! (
878+ position = car. position,
879+ "Ignoring received Car for our own Car"
880+ ) ;
881+ return ;
882+ }
883+
861884 // DIAGNOSTIC: Log at INFO level for batched Cars to trace attestation flow
862885 let batch_count = car. batch_digests . len ( ) ;
863886 if batch_count > 0 {
@@ -982,6 +1005,48 @@ impl Primary {
9821005 "Position gap detected, initiating gap recovery"
9831006 ) ;
9841007
1008+ // CRITICAL FIX: Update position tracking when we're behind
1009+ //
1010+ // If actual > expected, we've fallen behind this validator's position.
1011+ // The Car signature was already validated by core.handle_car(), so we
1012+ // know this is a legitimate Car. Update tracking to prevent death spiral:
1013+ //
1014+ // Without this fix:
1015+ // 1. We miss some Cars from validator X
1016+ // 2. New Cars from X trigger position gap errors
1017+ // 3. We never attest to X's new Cars
1018+ // 4. X's Cars never reach quorum
1019+ // 5. Transactions stuck forever
1020+ //
1021+ // With this fix:
1022+ // - We update tracking to actual position
1023+ // - Next Car from X (at actual+1) will be attested normally
1024+ // - Network recovers quickly
1025+ if actual > expected {
1026+ let car_hash = car. hash ( ) ;
1027+ info ! (
1028+ proposer = %validator,
1029+ expected,
1030+ actual,
1031+ car_hash = %car_hash,
1032+ "Updating position tracking to recover from gap - signature was valid"
1033+ ) ;
1034+ self . state . update_last_seen ( validator, actual, car_hash) ;
1035+
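1036+ // Also attest now: the signature is valid and we're syncing, which helps the network reach quorum faster.
1037+ // NOTE(review): this Car is still queued for gap sync just below - confirm it is not attested a second time when that queue drains.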
1038+ let attestation = self . core . create_attestation ( & car) ;
1039+ DCL_ATTESTATIONS_SENT . inc ( ) ;
1040+ self . network
1041+ . send_attestation ( car. proposer , & attestation)
1042+ . await ;
1043+ info ! (
1044+ proposer = %validator,
1045+ position = actual,
1046+ "Generated attestation after position gap recovery"
1047+ ) ;
1048+ }
1049+
9851050 // Queue the out-of-order Car for later processing
9861051 if !self . state . is_awaiting_gap_sync ( & validator, actual) {
9871052 self . state . queue_car_awaiting_gap ( car. clone ( ) , expected) ;
@@ -1228,6 +1293,30 @@ impl Primary {
12281293 "Received valid CarWithAttestation from peer"
12291294 ) ;
12301295
1296+ // CRITICAL FIX: Update position tracking from attested Car broadcasts
1297+ //
1298+ // Without this, validators can fall into a position gap death spiral:
1299+ // 1. Validator A's Cars don't reach quorum (for whatever reason)
1300+ // 2. Other validators' last_seen_positions[A] becomes stale
1301+ // 3. When A broadcasts new Cars, others detect a "position gap"
1302+ // 4. No attestations are generated, A's Cars never reach quorum
1303+ // 5. The gap grows forever
1304+ //
1305+ // By updating position tracking when we receive a valid CarWithAttestation
1306+ // (which has quorum verification), we stay in sync even if we missed
1307+ // some intermediate Cars. This breaks the death spiral.
1308+ let current_pos = self . state . last_seen_positions . get ( & car. proposer ) . copied ( ) ;
1309+ if current_pos. is_none_or ( |p| car. position > p) {
1310+ self . state
1311+ . update_last_seen ( car. proposer , car. position , car_hash) ;
1312+ info ! (
1313+ proposer = %car. proposer,
1314+ old_position = current_pos,
1315+ new_position = car. position,
1316+ "Updated position tracking from CarWithAttestation broadcast"
1317+ ) ;
1318+ }
1319+
12311320 // Persist attestation to storage if available
12321321 if let Some ( ref storage) = self . storage {
12331322 if let Err ( e) = storage. put_attestation ( attestation. clone ( ) ) . await {
@@ -1377,20 +1466,39 @@ impl Primary {
13771466 ) ;
13781467 // Could re-broadcast Car here
13791468 } else {
1380- // IMPORTANT: Don't timeout Cars with batches!
1381- // Peers need extra time to sync batch data before they can attest.
1382- // Without this, batched Cars timeout before peers finish syncing,
1383- // causing attestations to be rejected with UnknownCar error.
1469+ // Max backoff exceeded
13841470 if has_batches {
1385- // Reset the timeout without losing existing attestations
1386- info ! (
1387- hash = %hash,
1388- position = car. position,
1389- batch_count = car. batch_digests. len( ) ,
1390- attestation_count = self . attestation_collector. attestation_count( & hash) . unwrap_or( 0 ) ,
1391- "Extending timeout for batched Car - peers may still be syncing"
1392- ) ;
1393- self . attestation_collector . reset_timeout ( & hash) ;
1471+ // Try to extend timeout for batched Cars that need more time.
1472+ // reset_timeout() returns false if max resets exceeded.
1473+ let reset_count = self . attestation_collector . reset_count ( & hash) . unwrap_or ( 0 ) ;
1474+ if self . attestation_collector . reset_timeout ( & hash) {
1475+ info ! (
1476+ hash = %hash,
1477+ position = car. position,
1478+ batch_count = car. batch_digests. len( ) ,
1479+ attestation_count = self . attestation_collector. attestation_count( & hash) . unwrap_or( 0 ) ,
1480+ reset_count = reset_count + 1 ,
1481+ "Extending timeout for batched Car - peers may still be syncing"
1482+ ) ;
1483+ } else {
1484+ // Max resets exceeded - drop the Car and restore batches
1485+ warn ! (
1486+ hash = %hash,
1487+ position = car. position,
1488+ batch_count = car. batch_digests. len( ) ,
1489+ attestation_count = self . attestation_collector. attestation_count( & hash) . unwrap_or( 0 ) ,
1490+ reset_count,
1491+ "Batched Car exceeded max timeout resets - dropping and restoring batches"
1492+ ) ;
1493+ self . attestation_collector . remove ( & hash) ;
1494+ self . state . remove_pending_car ( & hash) ;
1495+
1496+ // Restore batch digests to pending so they can be re-batched
1497+ // This ensures transactions are not lost
1498+ for digest in & car. batch_digests {
1499+ self . state . add_batch_digest ( digest. clone ( ) ) ;
1500+ }
1501+ }
13941502 } else {
13951503 warn ! (
13961504 hash = %hash,
0 commit comments