Skip to content

Commit b40555e

Browse files
author
Ben Corlett
committed
Add initial otel spans and metrics
1 parent dbf5679 commit b40555e

15 files changed

+389
-10
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This file was automatically copied from notifications-utils@99.8.0
1+
# This file was automatically copied from notifications-utils@100.1.0
22

33
repos:
44
- repo: https://github.com/pre-commit/pre-commit-hooks

app/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from gds_metrics.metrics import Gauge, Histogram
2424
from notifications_utils import request_helper
2525
from notifications_utils.celery import NotifyCelery
26+
from notifications_utils.clients.otel.otel_client import init_otel_app
2627
from notifications_utils.clients.redis.redis_client import RedisClient
2728
from notifications_utils.clients.signing.signing_client import Signing
2829
from notifications_utils.clients.statsd.statsd_client import StatsdClient
@@ -171,6 +172,7 @@ def create_app(application):
171172
init_app(application)
172173

173174
# Metrics intentionally high up to give the most accurate timing and reliability that the metric is recorded
175+
init_otel_app(application)
174176
metrics.init_app(application)
175177
request_helper.init_app(application)
176178
db.init_app(application)

app/celery/nightly_tasks.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
is_dvla_working_day,
1111
)
1212
from notifications_utils.timezones import convert_utc_to_bst
13+
from opentelemetry import metrics
1314
from sqlalchemy import func
1415
from sqlalchemy.exc import SQLAlchemyError
1516

@@ -54,6 +55,13 @@
5455
)
5556
from app.utils import get_london_midnight_in_utc
5657

58+
meter = metrics.get_meter(__name__)
59+
60+
otel_timeout_sending_counter = meter.create_counter(
61+
"timeout_sending",
62+
description="Notifications counter that have timed out sending",
63+
)
64+
5765

5866
@notify_celery.task(name="remove_sms_email_jobs")
5967
@cronitor("remove_sms_email_jobs")
@@ -250,6 +258,12 @@ def timeout_notifications():
250258

251259
for notification in notifications:
252260
statsd_client.incr(f"timeout-sending.{notification.sent_by}")
261+
otel_timeout_sending_counter.add(
262+
1,
263+
{
264+
"notification_send_by": notification.sent_by,
265+
},
266+
)
253267
check_and_queue_callback_task(notification)
254268

255269
current_app.logger.info(

app/celery/process_ses_receipts_tasks.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import iso8601
55
from celery.exceptions import Retry
66
from flask import current_app, json
7+
from notifications_utils.clients.otel.utils import default_histogram_bucket
8+
from opentelemetry import metrics
79
from sqlalchemy.orm.exc import NoResultFound
810

911
from app import notify_celery, statsd_client
@@ -18,6 +20,20 @@
1820
handle_complaint,
1921
)
2022

23+
meter = metrics.get_meter(__name__)
24+
25+
otel_ses_notification_processing_histogram = meter.create_histogram(
26+
"ses_notification_processing",
27+
description="Time taken to process an SES notification in seconds",
28+
explicit_bucket_boundaries_advisory=default_histogram_bucket,
29+
unit="seconds",
30+
)
31+
32+
otel_ses_callback_counter = meter.create_counter(
33+
"ses_callback",
34+
description="Amount of non-duplicate ses callbacks processed",
35+
)
36+
2137

2238
@notify_celery.task(
2339
bind=True, name="process-ses-result", max_retries=5, default_retry_delay=300, early_log_level=logging.DEBUG
@@ -78,12 +94,24 @@ def process_ses_results(self, response):
7894
)
7995

8096
statsd_client.incr(f"callback.ses.{notification_status}")
81-
97+
otel_ses_callback_counter.add(
98+
1,
99+
{
100+
"notification_status": notification_status,
101+
},
102+
)
82103
if notification.sent_at:
83104
statsd_client.timing_with_dates(
84105
f"callback.ses.{notification_status}.elapsed-time", datetime.utcnow(), notification.sent_at
85106
)
86107

108+
otel_ses_notification_processing_histogram.record(
109+
(datetime.utcnow() - notification.sent_at).total_seconds(),
110+
{
111+
"notification_status": notification_status,
112+
},
113+
)
114+
87115
check_and_queue_callback_task(notification)
88116

89117
return True

app/celery/process_sms_client_response_tasks.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from datetime import datetime
44

55
from flask import current_app
6+
from notifications_utils.clients.otel.utils import default_histogram_bucket
67
from notifications_utils.template import SMSMessageTemplate
8+
from opentelemetry import metrics
79

810
from app import notify_celery, statsd_client
911
from app.clients import ClientException
@@ -21,6 +23,20 @@
2123
"Firetext": get_firetext_responses,
2224
}
2325

26+
meter = metrics.get_meter(__name__)
27+
28+
otel_provider_callback_completed = meter.create_histogram(
29+
"provider_sms",
30+
description="Time for sms sends to complete in seconds",
31+
explicit_bucket_boundaries_advisory=default_histogram_bucket,
32+
unit="seconds",
33+
)
34+
35+
otel_sms_international = meter.create_counter(
36+
"international_sms",
37+
description="Count of provider callbacks",
38+
)
39+
2440

2541
@notify_celery.task(
2642
bind=True, name="process-sms-client-response", max_retries=5, default_retry_delay=300, early_log_level=logging.DEBUG
@@ -81,6 +97,14 @@ def _process_for_status(notification_status, client_name, provider_reference, de
8197
notification.sent_at,
8298
)
8399

100+
otel_provider_callback_completed.record(
101+
(datetime.utcnow() - notification.sent_at).total_seconds(),
102+
{
103+
"client_name": client_name.lower(),
104+
"notification_status": notification_status,
105+
},
106+
)
107+
84108
if notification.billable_units == 0:
85109
service = notification.service
86110
template_model = dao_get_template_by_id(notification.template_id, notification.template_version)
@@ -98,3 +122,10 @@ def _process_for_status(notification_status, client_name, provider_reference, de
98122
check_and_queue_callback_task(notification)
99123
if notification.international:
100124
statsd_client.incr(f"international-sms.{notification_status}.{notification.phone_prefix}")
125+
otel_sms_international.add(
126+
1,
127+
{
128+
"notification_status": notification_status,
129+
"phone_prefix": notification.phone_prefix,
130+
},
131+
)

app/celery/scheduled_tasks.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
NotifyTicketType,
1515
)
1616
from notifications_utils.timezones import convert_utc_to_bst
17+
from opentelemetry import metrics
1718
from redis.exceptions import LockError
1819
from sqlalchemy import and_, between
1920
from sqlalchemy.exc import SQLAlchemyError
@@ -89,6 +90,13 @@
8990
from app.notifications.process_notifications import persist_notification, send_notification_to_queue
9091
from app.utils import get_london_midnight_in_utc
9192

93+
meter = metrics.get_meter(__name__)
94+
95+
otel_slow_delivery_ratio_gauge = meter.create_gauge(
96+
"slow_delivery_ratio",
97+
description="Ratio of slow delivery notifications",
98+
)
99+
92100

93101
@notify_celery.task(name="run-scheduled-jobs")
94102
@cronitor("run-scheduled-jobs")
@@ -204,6 +212,7 @@ def _check_slow_text_message_delivery_reports_and_raise_error_if_needed(reports:
204212
redis_store.set(CacheKeys.NUMBER_OF_TIMES_OVER_SLOW_SMS_DELIVERY_THRESHOLD, 0)
205213

206214

215+
# TODO: We are reimplementing a histogram here. We should just use that instead.
207216
@notify_celery.task(name="generate-sms-delivery-stats")
208217
def generate_sms_delivery_stats():
209218
for delivery_interval in (1, 5, 10):
@@ -215,11 +224,19 @@ def generate_sms_delivery_stats():
215224
statsd_client.gauge(
216225
f"slow-delivery.{report.provider}.delivered-within-minutes.{delivery_interval}.ratio", report.slow_ratio
217226
)
227+
otel_slow_delivery_ratio_gauge.set(
228+
report.slow_ratio,
229+
attributes={
230+
"provider": report.provider,
231+
"delivery_interval": delivery_interval,
232+
},
233+
)
218234

219235
total_notifications = sum(report.total_notifications for report in providers_slow_delivery_reports)
220236
slow_notifications = sum(report.slow_notifications for report in providers_slow_delivery_reports)
221237
ratio_slow_notifications = slow_notifications / total_notifications
222238

239+
# Not recording an overall otel metric here as this can be calculated
223240
statsd_client.gauge(
224241
f"slow-delivery.sms.delivered-within-minutes.{delivery_interval}.ratio", ratio_slow_notifications
225242
)

app/clients/email/aws_ses.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import boto3
44
import botocore
55
from flask import current_app
6+
from notifications_utils.clients.otel.utils import otel_duration_histogram
67

78
from app.clients import STATISTICS_DELIVERED, STATISTICS_FAILURE
89
from app.clients.email import (
@@ -65,6 +66,9 @@ def __init__(self, region, statsd_client):
6566
self._client = boto3.client("sesv2", region_name=region)
6667
self.statsd_client = statsd_client
6768

69+
@otel_duration_histogram(
70+
"aws_ses_send_email_duration", description="Time taken to send an email using AWS SES", unit="seconds"
71+
)
6872
def send_email(
6973
self,
7074
*,

app/clients/sms/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from time import monotonic
55

66
import requests
7+
from notifications_utils.clients.otel.utils import otel_duration_histogram
78
from urllib3.connection import HTTPConnection
89

910
from app.clients import Client, ClientException
@@ -55,6 +56,7 @@ def record_outcome(self, success):
5556
"Provider request for %s %s", self.name, "succeeded" if success else "failed"
5657
)
5758

59+
@otel_duration_histogram("send_sms", attributes=lambda args, kwargs: {"provider_name": args[0].name})
5860
def send_sms(self, to, content, reference, international, sender):
5961
start_time = monotonic()
6062

app/commands.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
from click_datetime import Datetime as click_dt
1717
from dateutil import rrule
1818
from flask import current_app, json
19+
from notifications_utils.clients.otel.utils import otel_duration_histogram, otel_span_with_status
1920
from notifications_utils.recipients import RecipientCSV
2021
from notifications_utils.statsd_decorators import statsd
2122
from notifications_utils.template import SMSMessageTemplate
23+
from opentelemetry.trace import get_tracer
2224
from sqlalchemy.exc import IntegrityError
2325
from sqlalchemy.orm.exc import NoResultFound
2426

@@ -397,6 +399,8 @@ def bulk_invite_user_to_service(file_name, service_id, user_id, auth_type, permi
397399
"-s", "--start_date", default=datetime(2017, 2, 1), help="start date inclusive", type=click_dt(format="%Y-%m-%d")
398400
)
399401
@statsd(namespace="tasks")
402+
@otel_duration_histogram("populate_notification_postage_duration")
403+
@otel_span_with_status(get_tracer(__name__), "populate_notification_postage")
400404
def populate_notification_postage(start_date):
401405
current_app.logger.info("populating historical notification postage")
402406

@@ -442,6 +446,8 @@ def populate_notification_postage(start_date):
442446
@click.option("-s", "--start_date", required=True, help="start date inclusive", type=click_dt(format="%Y-%m-%d"))
443447
@click.option("-e", "--end_date", required=True, help="end date inclusive", type=click_dt(format="%Y-%m-%d"))
444448
@statsd(namespace="tasks")
449+
@otel_duration_histogram("update_jobs_archived_flag")
450+
@otel_span_with_status(get_tracer(__name__), "populate_notification_postage")
445451
def update_jobs_archived_flag(start_date, end_date):
446452
current_app.logger.info("Archiving jobs created between %s to %s", start_date, end_date)
447453

@@ -475,6 +481,8 @@ def update_jobs_archived_flag(start_date, end_date):
475481
@notify_command(name="update-emails-to-remove-gsi")
476482
@click.option("-s", "--service_id", required=True, help="service id. Update all user.email_address to remove .gsi")
477483
@statsd(namespace="tasks")
484+
@otel_duration_histogram("update_emails_to_remove_gsi")
485+
@otel_span_with_status(get_tracer(__name__), "populate_notification_postage")
478486
def update_emails_to_remove_gsi(service_id):
479487
users_to_update = """SELECT u.id user_id, u.name, email_address, s.id, s.name
480488
FROM users u

app/config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from celery.schedules import crontab
66
from kombu import Exchange, Queue
7+
from notifications_utils.config import BaseConfig
78

89

910
class QueueNames:
@@ -88,7 +89,7 @@ class TaskNames:
8889
RECREATE_PDF_FOR_PRECOMPILED_LETTER = "recreate-pdf-for-precompiled-letter"
8990

9091

91-
class Config:
92+
class Config(BaseConfig):
9293
# URL of admin app
9394
ADMIN_BASE_URL = os.getenv("ADMIN_BASE_URL", "http://localhost:6012")
9495

@@ -100,6 +101,8 @@ class Config:
100101
CELERY_WORKER_LOG_LEVEL = os.getenv("CELERY_WORKER_LOG_LEVEL", "CRITICAL").upper()
101102
CELERY_BEAT_LOG_LEVEL = os.getenv("CELERY_BEAT_LOG_LEVEL", "INFO").upper()
102103

104+
OTEL_EXPORT_TYPE = os.environ.get("OTEL_EXPORT_TYPE", "otlp").lower().strip()
105+
103106
# secrets that internal apps, such as the admin app or document download, must use to authenticate with the API
104107
ADMIN_CLIENT_ID = "notify-admin"
105108
FUNCTIONAL_TESTS_CLIENT_ID = "notify-functional-tests"

0 commit comments

Comments
 (0)