|
2 | 2 |
|
3 | 3 | module Journaled |
4 | 4 | module Outbox |
5 | | - # Handles metric emission for the Worker |
| 5 | + # Handles metric emission for the Worker and Kinesis senders |
6 | 6 | # |
7 | | - # This class is responsible for collecting and emitting metrics about the outbox queue. |
| 7 | + # This class provides utility methods for collecting and emitting metrics. |
8 | 8 | class MetricEmitter |
9 | | - def initialize(worker_id:) |
10 | | - @worker_id = worker_id |
11 | | - end |
| 9 | + class << self |
| 10 | + # Emit batch processing metrics |
| 11 | + # |
| 12 | + # @param stats [Hash] Processing statistics with :succeeded, :failed_permanently, :failed_transiently |
| 13 | + # @param worker_id [String] ID of the worker processing the batch |
| 14 | + def emit_batch_metrics(stats, worker_id:) |
| 15 | + total_events = stats[:succeeded] + stats[:failed_permanently] + stats[:failed_transiently] |
12 | 16 |
|
13 | | - # Emit batch processing metrics |
14 | | - # |
15 | | - # @param stats [Hash] Processing statistics with :succeeded, :failed_permanently, :failed_transiently |
16 | | - def emit_batch_metrics(stats) |
17 | | - total_events = stats[:succeeded] + stats[:failed_permanently] + stats[:failed_transiently] |
| 17 | + emit_metric('journaled.outbox_event.processed', value: total_events, worker_id:) |
| 18 | + emit_metric('journaled.outbox_event.sent', value: stats[:succeeded], worker_id:) |
| 19 | + emit_metric('journaled.outbox_event.failed', value: stats[:failed_permanently], worker_id:) |
| 20 | + emit_metric('journaled.outbox_event.errored', value: stats[:failed_transiently], worker_id:) |
| 21 | + end |
18 | 22 |
|
19 | | - emit_metric('journaled.worker.batch_process', value: total_events) |
20 | | - emit_metric('journaled.worker.batch_sent', value: stats[:succeeded]) |
21 | | - emit_metric('journaled.worker.batch_failed_permanently', value: stats[:failed_permanently]) |
22 | | - emit_metric('journaled.worker.batch_failed_transiently', value: stats[:failed_transiently]) |
23 | | - end |
| 23 | + # Collect and emit queue metrics |
| 24 | + # |
| 25 | + # This calculates various queue statistics and emits individual metrics for each. |
| 26 | + # @param worker_id [String] ID of the worker collecting metrics |
| 27 | + def emit_queue_metrics(worker_id:) |
| 28 | + metrics = calculate_queue_metrics |
24 | 29 |
|
25 | | - # Collect and emit queue metrics |
26 | | - # |
27 | | - # This calculates various queue statistics and emits individual metrics for each. |
28 | | - def emit_queue_metrics |
29 | | - metrics = calculate_queue_metrics |
| 30 | + emit_metric('journaled.worker.queue_total_count', value: metrics[:total_count], worker_id:) |
| 31 | + emit_metric('journaled.worker.queue_workable_count', value: metrics[:workable_count], worker_id:) |
| 32 | + emit_metric('journaled.worker.queue_failed_count', value: metrics[:failed_count], worker_id:) |
| 33 | + emit_metric('journaled.worker.queue_oldest_age_seconds', value: metrics[:oldest_age_seconds], worker_id:) |
30 | 34 |
|
31 | | - emit_metric('journaled.worker.queue_total_count', value: metrics[:total_count]) |
32 | | - emit_metric('journaled.worker.queue_workable_count', value: metrics[:workable_count]) |
33 | | - emit_metric('journaled.worker.queue_erroring_count', value: metrics[:erroring_count]) |
34 | | - emit_metric('journaled.worker.queue_oldest_age_seconds', value: metrics[:oldest_age_seconds]) |
| 35 | + Rails.logger.info( |
| 36 | + "Queue metrics: total=#{metrics[:total_count]}, " \ |
| 37 | + "workable=#{metrics[:workable_count]}, " \ |
| 38 | + "failed=#{metrics[:failed_count]}, " \ |
| 39 | + "oldest_age=#{metrics[:oldest_age_seconds].round(2)}s", |
| 40 | + ) |
| 41 | + end |
35 | 42 |
|
36 | | - Rails.logger.info( |
37 | | - "Queue metrics: total=#{metrics[:total_count]}, " \ |
38 | | - "workable=#{metrics[:workable_count]}, " \ |
39 | | - "erroring=#{metrics[:erroring_count]}, " \ |
40 | | - "oldest_age=#{metrics[:oldest_age_seconds].round(2)}s", |
41 | | - ) |
42 | | - end |
| 43 | + # Emit a metric notification for a Kinesis send failure |
| 44 | + # |
| 45 | + # @param event [Journaled::Outbox::Event] The failed event |
| 46 | + # @param error_code [String] The error code (e.g., 'ProvisionedThroughputExceededException') |
| 47 | + def emit_kinesis_failure(event:, error_code:) |
| 48 | + emit_metric( |
| 49 | + 'journaled.kinesis.send_failure', |
| 50 | + partition_key: event.partition_key, |
| 51 | + error_code:, |
| 52 | + stream_name: event.stream_name, |
| 53 | + event_type: event.event_type, |
| 54 | + ) |
| 55 | + end |
43 | 56 |
|
44 | | - private |
| 57 | + private |
45 | 58 |
|
46 | | - attr_reader :worker_id |
47 | | - |
48 | | - # Emit a single metric notification |
49 | | - # |
50 | | - # @param event_name [String] The name of the metric event |
51 | | - # @param payload [Hash] Additional payload data (event_count, value, etc.) |
52 | | - def emit_metric(event_name, payload) |
53 | | - ActiveSupport::Notifications.instrument( |
54 | | - event_name, |
55 | | - payload.merge(worker_id:), |
56 | | - ) |
57 | | - end |
| 59 | + # Emit a single metric notification |
| 60 | + # |
| 61 | + # @param event_name [String] The name of the metric event |
| 62 | + # @param payload [Hash] Additional payload data (value, worker_id, error_code, etc.) |
| 63 | + def emit_metric(event_name, payload) |
| 64 | + ActiveSupport::Notifications.instrument(event_name, payload) |
| 65 | + end |
58 | 66 |
|
59 | | - # Calculate queue metrics |
60 | | - # |
61 | | - # @return [Hash] Metrics including counts and oldest event timestamp |
62 | | - def calculate_queue_metrics |
63 | | - # Use a single query with COUNT(*) FILTER to calculate all counts in one table scan |
64 | | - result = Event.connection.select_one( |
65 | | - Event.select( |
66 | | - 'COUNT(*) AS total_count', |
67 | | - 'COUNT(*) FILTER (WHERE failed_at IS NULL) AS workable_count', |
68 | | - 'COUNT(*) FILTER (WHERE failure_reason IS NOT NULL AND failed_at IS NULL) AS erroring_count', |
69 | | - 'MIN(created_at) FILTER (WHERE failed_at IS NULL) AS oldest_non_failed_timestamp', |
70 | | - ).to_sql, |
71 | | - ) |
| 67 | + # Calculate queue metrics |
| 68 | + # |
| 69 | + # @return [Hash] Metrics including counts and the age in seconds of the oldest non-failed event |
| 70 | + def calculate_queue_metrics |
| 71 | + # Use a single query with COUNT(*) FILTER to calculate all counts in one table scan |
| 72 | + result = Event.connection.select_one( |
| 73 | + Event.select( |
| 74 | + 'COUNT(*) AS total_count', |
| 75 | + 'COUNT(*) FILTER (WHERE failed_at IS NULL) AS workable_count', |
| 76 | + 'COUNT(*) FILTER (WHERE failure_reason IS NOT NULL AND failed_at IS NULL) AS failed_count', |
| 77 | + 'MIN(created_at) FILTER (WHERE failed_at IS NULL) AS oldest_non_failed_timestamp', |
| 78 | + ).to_sql, |
| 79 | + ) |
72 | 80 |
|
73 | | - oldest_timestamp = result['oldest_non_failed_timestamp'] |
74 | | - oldest_age_seconds = oldest_timestamp ? Time.current - oldest_timestamp : 0 |
| 81 | + oldest_timestamp = result['oldest_non_failed_timestamp'] |
| 82 | + oldest_age_seconds = oldest_timestamp ? Time.current - oldest_timestamp : 0 |
75 | 83 |
|
76 | | - { |
77 | | - total_count: result['total_count'], |
78 | | - workable_count: result['workable_count'], |
79 | | - erroring_count: result['erroring_count'], |
80 | | - oldest_age_seconds:, |
81 | | - } |
| 84 | + { |
| 85 | + total_count: result['total_count'], |
| 86 | + workable_count: result['workable_count'], |
| 87 | + failed_count: result['failed_count'], |
| 88 | + oldest_age_seconds:, |
| 89 | + } |
| 90 | + end |
82 | 91 | end |
83 | 92 | end |
84 | 93 | end |
|
0 commit comments