wormholelabs-xyz
diff --git a/node/pkg/processor/cleanup.go b/node/pkg/processor/cleanup.go
Lines changed: 177 additions & 145 deletions
diff --git a/node/pkg/processor/message.go b/node/pkg/processor/message.go
Lines changed: 3 additions & 2 deletions
@@ -70,179 +70,196 @@ var (
 
 // handleCleanup handles periodic retransmissions and cleanup of observations
 func (p *Processor) handleCleanup(ctx context.Context) {
+	p.cleanupState()
+	p.cleanupPythnetVaas()
+}
+
+// cleanupState walks through the aggregation state map and cleans up entries that are no longer needed. It grabs the state lock.
+func (p *Processor) cleanupState() {
+	p.state.signaturesLock.Lock()
+	defer p.state.signaturesLock.Unlock()
+
 	p.logger.Info("aggregation state summary", zap.Int("cached", len(p.state.signatures)))
 	aggregationStateEntries.Set(float64(len(p.state.signatures)))
 
 	for hash, s := range p.state.signatures {
-		delta := time.Since(s.firstObserved)
-
-		if !s.submitted && s.ourObservation != nil && delta > settlementTime {
-			// Expire pending VAAs post settlement time if we have a stored quorum VAA.
-			//
-			// This occurs when we observed a message after the cluster has already reached
-			// consensus on it, causing us to never achieve quorum.
-			if ourVaa, ok := s.ourObservation.(*VAA); ok {
-				if p.haveSignedVAA(*db.VaaIDFromVAA(&ourVaa.VAA)) {
-					// If we have a stored quorum VAA, we can safely expire the state.
-					//
-					// This is a rare case, and we can safely expire the state, since we
-					// have a quorum VAA.
-					p.logger.Info("Expiring late VAA",
-						zap.String("message_id", ourVaa.VAA.MessageID()),
-						zap.String("digest", hash),
-						zap.Duration("delta", delta),
-					)
-					aggregationStateLate.Inc()
-					delete(p.state.signatures, hash)
-					continue
-				}
-			}
+		if shouldDelete := p.cleanUpStateEntry(hash, s); shouldDelete {
+			delete(p.state.signatures, hash) // Can't use p.state.delete() because we're holding the lock.
 		}
+	}
+}
 
-		switch {
-		case !s.settled && delta > settlementTime:
-			// After 30 seconds, the observation is considered settled - it's unlikely that more observations will
-			// arrive, barring special circumstances. This is a better time to count misses than submission,
-			// because we submit right when we quorum rather than waiting for all observations to arrive.
-			s.settled = true
-
-			// Use either the most recent (in case of a observation we haven't seen) or stored gs, if available.
-			var gs *common.GuardianSet
-			if s.gs != nil {
-				gs = s.gs
-			} else {
-				gs = p.gs
-			}
-
-			hasSigs := len(s.signatures)
-			quorum := hasSigs >= gs.Quorum()
+// cleanUpStateEntry cleans up a single aggregation state entry. It grabs the lock for that entry. Returns true if the entry should be deleted.
+func (p *Processor) cleanUpStateEntry(hash string, s *state) bool {
+	s.lock.Lock()
+	defer s.lock.Unlock()
 
-			var chain vaa.ChainID
-			if s.ourObservation != nil {
-				chain = s.ourObservation.GetEmitterChain()
-			}
+	delta := time.Since(s.firstObserved)
 
-			if p.logger.Level().Enabled(zapcore.DebugLevel) {
-				p.logger.Debug("observation considered settled",
-					zap.String("message_id", s.LoggingID()),
+	if !s.submitted && s.ourObservation != nil && delta > settlementTime {
+		// Expire pending VAAs post settlement time if we have a stored quorum VAA.
+		//
+		// This occurs when we observed a message after the cluster has already reached
+		// consensus on it, causing us to never achieve quorum.
+		if ourVaa, ok := s.ourObservation.(*VAA); ok {
+			if p.haveSignedVAA(*db.VaaIDFromVAA(&ourVaa.VAA)) {
+				// If we have a stored quorum VAA, we can safely expire the state.
+				//
+				// This is a rare case, and we can safely expire the state, since we
+				// have a quorum VAA.
+				p.logger.Info("Expiring late VAA",
+					zap.String("message_id", ourVaa.VAA.MessageID()),
 					zap.String("digest", hash),
 					zap.Duration("delta", delta),
-					zap.Int("have_sigs", hasSigs),
-					zap.Int("required_sigs", gs.Quorum()),
-					zap.Bool("quorum", quorum),
-					zap.Stringer("emitter_chain", chain),
 				)
+				aggregationStateLate.Inc()
+				return true
 			}
+		}
+	}
 
-			for _, k := range gs.Keys {
-				if _, ok := s.signatures[k]; ok {
-					aggregationStateFulfillment.WithLabelValues(k.Hex(), s.source, "present").Inc()
-				} else {
-					aggregationStateFulfillment.WithLabelValues(k.Hex(), s.source, "missing").Inc()
-				}
+	switch {
+	case !s.settled && delta > settlementTime:
+		// After 30 seconds, the observation is considered settled - it's unlikely that more observations will
+		// arrive, barring special circumstances. This is a better time to count misses than submission,
+		// because we submit right when we quorum rather than waiting for all observations to arrive.
+		s.settled = true
+
+		// Peg the settlement metrics using the guardian set stored with this state, falling back to the current guardian set. If neither is available (extremely unlikely), we just won't peg the metrics.
+		gs := s.gs
+		if gs == nil {
+			gs = p.gst.Get()
+			if gs == nil {
+				return false
 			}
-		case s.submitted && delta.Hours() >= 1:
-			// We could delete submitted observations right away, but then we'd lose context about additional (late)
-			// observation that come in. Therefore, keep it for a reasonable amount of time.
-			// If a very late observation arrives after cleanup, a nil aggregation state will be created
-			// and then expired after a while (as noted in observation.go, this can be abused by a byzantine guardian).
-			if p.logger.Level().Enabled(zapcore.DebugLevel) {
-				p.logger.Debug("expiring submitted observation",
-					zap.String("message_id", s.LoggingID()),
-					zap.String("digest", hash),
-					zap.Duration("delta", delta),
-				)
+		}
+
+		hasSigs := len(s.signatures)
+		wantSigs := vaa.CalculateQuorum(len(gs.Keys))
+		quorum := hasSigs >= wantSigs
+
+		var chain vaa.ChainID
+		if s.ourObservation != nil {
+			chain = s.ourObservation.GetEmitterChain()
+		}
+
+		if p.logger.Level().Enabled(zapcore.DebugLevel) {
+			p.logger.Debug("observation considered settled",
+				zap.String("message_id", s.LoggingID()),
+				zap.String("digest", hash),
+				zap.Duration("delta", delta),
+				zap.Int("have_sigs", hasSigs),
+				zap.Int("required_sigs", wantSigs),
+				zap.Bool("quorum", quorum),
+				zap.Stringer("emitter_chain", chain),
+			)
+		}
+
+		for _, k := range gs.Keys {
+			if _, ok := s.signatures[k]; ok {
+				aggregationStateFulfillment.WithLabelValues(k.Hex(), s.source, "present").Inc()
+			} else {
+				aggregationStateFulfillment.WithLabelValues(k.Hex(), s.source, "missing").Inc()
 			}
-			delete(p.state.signatures, hash)
-			aggregationStateExpiration.Inc()
-		case !s.submitted && ((s.ourObs != nil && delta > retryLimitOurs) || (s.ourObs == nil && delta > retryLimitNotOurs)):
-			// Clearly, this horse is dead and continued beatings won't bring it closer to quorum.
-			p.logger.Info("expiring unsubmitted observation after exhausting retries",
+		}
+	case s.submitted && delta.Hours() >= 1:
+		// We could delete submitted observations right away, but then we'd lose context about additional (late)
+		// observation that come in. Therefore, keep it for a reasonable amount of time.
+		// If a very late observation arrives after cleanup, a nil aggregation state will be created
+		// and then expired after a while (as noted in observation.go, this can be abused by a byzantine guardian).
+		if p.logger.Level().Enabled(zapcore.DebugLevel) {
+			p.logger.Debug("expiring submitted observation",
 				zap.String("message_id", s.LoggingID()),
 				zap.String("digest", hash),
 				zap.Duration("delta", delta),
-				zap.Bool("weObserved", s.ourObs != nil),
 			)
-			delete(p.state.signatures, hash)
-			aggregationStateTimeout.Inc()
-		case !s.submitted && delta >= FirstRetryMinWait && time.Since(s.nextRetry) >= 0:
-			// Poor observation has been unsubmitted for five minutes - clearly, something went wrong.
-			// If we have previously submitted an observation, and it was reliable, we can make another attempt to get
-			// it over the finish line by sending a re-observation request to the network and rebroadcasting our
-			// sig. If we do not have an observation, it means we either never observed it, or it got
-			// revived by a malfunctioning guardian node, in which case, we can't do anything about it
-			// and just delete it to keep our state nice and lean.
-			if s.ourObs != nil {
-				// Unreliable observations cannot be resubmitted and can be considered failed after 5 minutes
-				if !s.ourObservation.IsReliable() {
-					p.logger.Info("expiring unsubmitted unreliable observation",
+		}
+		aggregationStateExpiration.Inc()
+		return true
+	case !s.submitted && ((s.ourMsg != nil && delta > retryLimitOurs) || (s.ourMsg == nil && delta > retryLimitNotOurs)):
+		// Clearly, this horse is dead and continued beatings won't bring it closer to quorum.
+		p.logger.Info("expiring unsubmitted observation after exhausting retries",
+			zap.String("message_id", s.LoggingID()),
+			zap.String("digest", hash),
+			zap.Duration("delta", delta),
+			zap.Bool("weObserved", s.ourMsg != nil),
+		)
+		aggregationStateTimeout.Inc()
+		return true
+	case !s.submitted && delta >= FirstRetryMinWait && time.Since(s.nextRetry) >= 0:
+		// Poor observation has been unsubmitted for five minutes - clearly, something went wrong.
+		// If we have previously submitted an observation, and it was reliable, we can make another attempt to get
+		// it over the finish line by sending a re-observation request to the network and rebroadcasting our
+		// sig. If we do not have an observation, it means we either never observed it, or it got
+		// revived by a malfunctioning guardian node, in which case, we can't do anything about it
+		// and just delete it to keep our state nice and lean.
+		if s.ourMsg != nil {
+			// Unreliable observations cannot be resubmitted and can be considered failed after 5 minutes
+			if !s.ourObservation.IsReliable() {
+				p.logger.Info("expiring unsubmitted unreliable observation",
+					zap.String("message_id", s.LoggingID()),
+					zap.String("digest", hash),
+					zap.Duration("delta", delta),
+				)
+				aggregationStateTimeout.Inc()
+				return true
+			}
+
+			// Reobservation requests should not be resubmitted but we will keep waiting for more observations.
+			if s.ourObservation.IsReobservation() {
+				if p.logger.Level().Enabled(zapcore.DebugLevel) {
+					p.logger.Debug("not submitting reobservation request for reobservation",
 						zap.String("message_id", s.LoggingID()),
 						zap.String("digest", hash),
 						zap.Duration("delta", delta),
 					)
-					delete(p.state.signatures, hash)
-					aggregationStateTimeout.Inc()
-					break
-				}
-
-				// Reobservation requests should not be resubmitted but we will keep waiting for more observations.
-				if s.ourObservation.IsReobservation() {
-					if p.logger.Level().Enabled(zapcore.DebugLevel) {
-						p.logger.Debug("not submitting reobservation request for reobservation",
-							zap.String("message_id", s.LoggingID()),
-							zap.String("digest", hash),
-							zap.Duration("delta", delta),
-						)
-					}
-					break
 				}
+				return false
+			}
 
-				// If we have already stored this VAA, there is no reason for us to request reobservation.
-				alreadyInDB, err := p.signedVaaAlreadyInDB(hash, s)
-				if err != nil {
-					p.logger.Error("failed to check if observation is already in DB, requesting reobservation",
-						zap.String("message_id", s.LoggingID()),
-						zap.String("hash", hash),
-						zap.Error(err))
-				}
+			// If we have already stored this VAA, there is no reason for us to request reobservation.
+			alreadyInDB, err := p.signedVaaAlreadyInDB(hash, s)
+			if err != nil {
+				p.logger.Error("failed to check if observation is already in DB, requesting reobservation",
+					zap.String("message_id", s.LoggingID()),
+					zap.String("hash", hash),
+					zap.Error(err))
+			}
 
-				if alreadyInDB {
-					if p.logger.Level().Enabled(zapcore.DebugLevel) {
-						p.logger.Debug("observation already in DB, not requesting reobservation",
-							zap.String("message_id", s.LoggingID()),
-							zap.String("digest", hash),
-						)
-					}
-				} else {
-					p.logger.Info("resubmitting observation",
+			if alreadyInDB {
+				if p.logger.Level().Enabled(zapcore.DebugLevel) {
+					p.logger.Debug("observation already in DB, not requesting reobservation",
 						zap.String("message_id", s.LoggingID()),
 						zap.String("digest", hash),
-						zap.Duration("delta", delta),
-						zap.String("firstObserved", s.firstObserved.String()),
-						zap.Int("numSignatures", len(s.signatures)),
 					)
-					req := &gossipv1.ObservationRequest{
-						ChainId: uint32(s.ourObservation.GetEmitterChain()),
-						TxHash:  s.txHash,
-					}
-					if err := common.PostObservationRequest(p.obsvReqSendC, req); err != nil {
-						p.logger.Warn("failed to broadcast re-observation request", zap.String("message_id", s.LoggingID()), zap.Error(err))
-					}
-					if s.ourMsg != nil {
-						// This is the case for immediately published messages (as well as anything still pending from before the cutover).
-						p.gossipAttestationSendC <- s.ourMsg
-					} else {
-						p.postObservationToBatch(s.ourObs)
-					}
-					s.retryCtr++
-					s.nextRetry = time.Now().Add(nextRetryDuration(s.retryCtr))
-					aggregationStateRetries.Inc()
 				}
 			} else {
-				// For nil state entries, we log the quorum to determine whether the
-				// network reached consensus without us. We don't know the correct guardian
-				// set, so we simply use the most recent one.
-				hasSigs := len(s.signatures)
+				p.logger.Info("resubmitting observation",
+					zap.String("message_id", s.LoggingID()),
+					zap.String("digest", hash),
+					zap.Duration("delta", delta),
+					zap.String("firstObserved", s.firstObserved.String()),
+				)
+				req := &gossipv1.ObservationRequest{
+					ChainId: uint32(s.ourObservation.GetEmitterChain()),
+					TxHash:  s.txHash,
+				}
+				if err := common.PostObservationRequest(p.obsvReqSendC, req); err != nil {
+					p.logger.Warn("failed to broadcast re-observation request", zap.String("message_id", s.LoggingID()), zap.Error(err))
+				}
+				p.gossipSendC <- s.ourMsg
+				s.retryCtr++
+				s.nextRetry = time.Now().Add(nextRetryDuration(s.retryCtr))
+				aggregationStateRetries.Inc()
+			}
+		} else {
+			// For nil state entries, we log the quorum to determine whether the
+			// network reached consensus without us. We don't know the correct guardian
+			// set, so we simply use the most recent one.
+			hasSigs := len(s.signatures)
+			gs := p.gst.Get()
+			if gs != nil {
+				wantSigs := vaa.CalculateQuorum(len(gs.Keys))
 
 				if p.logger.Level().Enabled(zapcore.DebugLevel) {
 					p.logger.Debug("expiring unsubmitted nil observation",
@@ -254,13 +271,28 @@ func (p *Processor) handleCleanup(ctx context.Context) {
 						zap.Bool("quorum", hasSigs >= p.gs.Quorum()),
 					)
 				}
-				delete(p.state.signatures, hash)
-				aggregationStateUnobserved.Inc()
+			} else {
+				if p.logger.Level().Enabled(zapcore.DebugLevel) {
+					p.logger.Debug("expiring unsubmitted nil observation, gs is nil",
+						zap.String("message_id", s.LoggingID()),
+						zap.String("digest", hash),
+						zap.Duration("delta", delta),
+						zap.Int("have_sigs", hasSigs),
+					)
+				}
 			}
+			aggregationStateUnobserved.Inc()
+			return true
 		}
 	}
 
-	// Clean up old pythnet VAAs.
+	return false
+}
+
+// cleanupPythnetVaas deletes expired pythnet VAAs. It grabs the pythnet VAA lock.
+func (p *Processor) cleanupPythnetVaas() {
+	p.pythnetVaaLock.Lock()
+	defer p.pythnetVaaLock.Unlock()
 	oldestTime := time.Now().Add(-time.Hour)
 	for key, pe := range p.pythnetVaas {
 		if pe.updateTime.Before(oldestTime) {
 
@@ -32,7 +32,8 @@ var (
 // handleMessage processes a message received from a chain and instantiates our deterministic copy of the VAA. An
 // event may be received multiple times and must be handled in an idempotent fashion.
 func (p *Processor) handleMessage(k *common.MessagePublication) {
-	if p.gs == nil {
+	gs := p.gst.Get()
+	if gs == nil {
 		p.logger.Warn("dropping observation since we haven't initialized our guardian set yet",
 			zap.String("message_id", k.MessageIDString()),
 			zap.Uint32("nonce", k.Nonce),
@@ -50,7 +51,7 @@ func (p *Processor) handleMessage(k *common.MessagePublication) {
 	v := &VAA{
 		VAA: vaa.VAA{
 			Version:          vaa.SupportedVAAVersion,
-			GuardianSetIndex: p.gs.Index,
+			GuardianSetIndex: gs.Index,
 			Signatures:       nil,
 			Timestamp:        k.Timestamp,
 			Nonce:            k.Nonce,