Skip to content

Commit 3d5a147

Browse files
committed
Added LogPoller metric to track skipped blocks
1 parent 43212c2 commit 3d5a147

File tree

4 files changed

+52
-5
lines changed

4 files changed

+52
-5
lines changed

pkg/solana/logpoller/job_get_block.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ func (j *getBlockJob) Done() <-chan struct{} {
5353

5454
func (j *getBlockJob) Abort(ctx context.Context) error {
5555
j.aborted = true
56+
j.metrics.SetBlockSkipped(ctx, int64(j.slotNumber)) // nolint:gosec // G115: integer overflow conversion uint64 -> int64
5657
var abort types.Block
5758
abort.Aborted = true
5859
abort.SlotNumber = j.slotNumber

pkg/solana/logpoller/job_get_block_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,15 @@ type solLpPromTest struct {
2727
id string
2828
txsTruncated outcomeDependantTestMetric
2929
txsLogParsingError outcomeDependantTestMetric
30+
blockSkipped float64
3031
}
3132

3233
func (p solLpPromTest) assertEqual(t *testing.T) {
3334
assert.InDelta(t, p.txsTruncated.succeeded, testutil.ToFloat64(promSolLp.txsTruncated.succeeded.WithLabelValues(p.id)), 0.0001, "mismatch: truncated succeeded")
3435
assert.InDelta(t, p.txsTruncated.reverted, testutil.ToFloat64(promSolLp.txsTruncated.reverted.WithLabelValues(p.id)), 0.0001, "mismatch: truncated reverted")
3536
assert.InDelta(t, p.txsLogParsingError.succeeded, testutil.ToFloat64(promSolLp.txsLogParsingError.succeeded.WithLabelValues(p.id)), 0.0001, "mismatch: log parsing error succeeded")
3637
assert.InDelta(t, p.txsLogParsingError.reverted, testutil.ToFloat64(promSolLp.txsLogParsingError.reverted.WithLabelValues(p.id)), 0.0001, "mismatch: log parsing error reverted")
38+
assert.InDelta(t, p.blockSkipped, testutil.ToFloat64(promLpBlockSkipped.WithLabelValues(p.id)), 0.0001, "mismatch: block skipped")
3739
}
3840

3941
// resetPromMetricsForLabel clears the prometheus counters for the given label
@@ -43,6 +45,7 @@ func resetPromMetricsForLabel(label string) {
4345
promSolLp.txsTruncated.reverted.DeleteLabelValues(label)
4446
promSolLp.txsLogParsingError.succeeded.DeleteLabelValues(label)
4547
promSolLp.txsLogParsingError.reverted.DeleteLabelValues(label)
48+
promLpBlockSkipped.DeleteLabelValues(label)
4649
}
4750

4851
func TestGetBlockJob(t *testing.T) {
@@ -186,6 +189,32 @@ func TestGetBlockJob(t *testing.T) {
186189
default:
187190
}
188191
})
192+
t.Run("Abort emits block skipped metric", func(t *testing.T) {
193+
resetPromMetricsForLabel(t.Name())
194+
lggr := logger.Sugared(logger.Test(t))
195+
metrics, err := NewSolLpMetrics(t.Name())
196+
require.NoError(t, err)
197+
blocks := make(chan types.Block, 1)
198+
job := newGetBlockJob(nil, nil, blocks, lggr, slotNumber, metrics, nil)
199+
err = job.Abort(t.Context())
200+
require.NoError(t, err)
201+
202+
result := <-blocks
203+
assert.True(t, result.Aborted)
204+
assert.Equal(t, slotNumber, result.SlotNumber)
205+
206+
expectedMetrics := solLpPromTest{
207+
id: t.Name(),
208+
blockSkipped: float64(slotNumber),
209+
}
210+
expectedMetrics.assertEqual(t)
211+
212+
select {
213+
case <-job.Done():
214+
default:
215+
t.Fatal("expected job to be done")
216+
}
217+
})
189218
t.Run("Happy path", func(t *testing.T) {
190219
resetPromMetricsForLabel(t.Name()) // Reset counters to avoid accumulation across test runs
191220
client := mocks.NewRPCClient(t)

pkg/solana/logpoller/metrics.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ var promLpLastProcessedSlot = promauto.NewGaugeVec(prometheus.GaugeOpts{
2929
Help: "Last processed slot by log poller",
3030
}, []string{"chainID"})
3131

32+
var promLpBlockSkipped = promauto.NewGaugeVec(prometheus.GaugeOpts{
33+
Name: "solana_log_poller_block_skipped",
34+
Help: "Slot number of the most recently skipped block due to max retry exhaustion",
35+
}, []string{"chainID"})
36+
3237
type solLpMetrics struct {
3338
metrics.Labeler
3439
chainID string
@@ -37,6 +42,7 @@ type solLpMetrics struct {
3742
txsTruncated outcomeDependantMetric
3843
txsLogParsingError outcomeDependantMetric
3944
lastProcessedSlot metric.Int64Gauge
45+
blockSkipped metric.Int64Gauge
4046
}
4147

4248
func NewSolLpMetrics(chainID string) (*solLpMetrics, error) {
@@ -57,13 +63,19 @@ func NewSolLpMetrics(chainID string) (*solLpMetrics, error) {
5763
return nil, fmt.Errorf("failed to register solana_log_poller_last_processed_slot: %w", err)
5864
}
5965

66+
blockSkipped, err := meter.Int64Gauge("solana_log_poller_block_skipped")
67+
if err != nil {
68+
return nil, fmt.Errorf("failed to register solana_log_poller_block_skipped: %w", err)
69+
}
70+
6071
return &solLpMetrics{
6172
chainID: chainID,
6273
Labeler: metrics.NewLabeler().With("chainID", chainID),
6374

6475
txsTruncated: *truncatedTxs,
6576
txsLogParsingError: *txLogParsingError,
6677
lastProcessedSlot: lastProcessedSlot,
78+
blockSkipped: blockSkipped,
6779
}, nil
6880
}
6981

@@ -84,6 +96,11 @@ func (m *solLpMetrics) SetLatestProcessedSlot(ctx context.Context, slot int64) {
8496
m.lastProcessedSlot.Record(ctx, slot, metric.WithAttributes(m.GetOtelAttributes()...))
8597
}
8698

99+
func (m *solLpMetrics) SetBlockSkipped(ctx context.Context, slotNumber int64) {
100+
promLpBlockSkipped.WithLabelValues(m.chainID).Set(float64(slotNumber))
101+
m.blockSkipped.Record(ctx, slotNumber, metric.WithAttributes(m.GetOtelAttributes()...))
102+
}
103+
87104
func (m *solLpMetrics) incrementForOutcome(ctx context.Context, prom outcomeDependantProm, me outcomeDependantMetric, outcome txOutcome) {
88105
switch outcome {
89106
case txSucceeded:

pkg/solana/logpoller/worker/worker.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,21 +236,21 @@ func (g *Group) runRetryQueue(ctx context.Context) {
236236

237237
// retry count starts at 0 so check if it equals max retry count to determine if it has reached the threshold
238238
if retry.count >= g.maxRetryCount {
239-
g.lggr.Criticalf("job %s exceeded max retry count %d. Resolution most likely requires manual intervention. Errors: %s", failedAttempt.Job, g.maxRetryCount, errors.Join(retry.errs...))
239+
g.lggr.Criticalw("job exceeded max retry count. Resolution most likely requires manual intervention.", "job", failedAttempt.Job, "maxRetryCount", g.maxRetryCount, "errors", errors.Join(retry.errs...))
240240
if err := retry.Abort(ctx); err != nil {
241-
g.lggr.Criticalf("failed to abort retry: %s", err)
241+
g.lggr.Criticalw("failed to abort retry", "err", err)
242242
}
243243
// Continue to avoid adding job back to retry map
244244
continue
245245
}
246246

247-
g.lggr.Errorf("retrying job %s in %s", failedAttempt.Job, wait)
247+
g.lggr.Errorw("retrying job", "job", failedAttempt.Job, "wait", wait)
248248
retry.when = time.Now().Add(wait)
249249
default:
250250
// first retry
251251
wait := calculateExponentialBackoffWithJitter(0)
252252

253-
g.lggr.Errorf("retrying job %s in %s", failedAttempt.Job, wait)
253+
g.lggr.Errorw("retrying job", "job", failedAttempt.Job, "wait", wait)
254254

255255
retry = retryableJob{
256256
name: createRandomString(12),
@@ -264,7 +264,7 @@ func (g *Group) runRetryQueue(ctx context.Context) {
264264
g.retryMap[retry.name] = retry
265265

266266
if len(g.retryMap) >= DefaultNotifyRetryDepth {
267-
g.lggr.Errorf("retry queue depth: %d", len(g.retryMap))
267+
g.lggr.Errorw("retry queue depth", "depth", len(g.retryMap))
268268
}
269269
g.mu.Unlock()
270270
}

0 commit comments

Comments (0)