Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# This file was automatically copied from notifications-utils@99.8.0
# This file was automatically copied from notifications-utils@100.1.0

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
2 changes: 2 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from gds_metrics.metrics import Gauge, Histogram
from notifications_utils import request_helper
from notifications_utils.celery import NotifyCelery
from notifications_utils.clients.otel.otel_client import init_otel_app
from notifications_utils.clients.redis.redis_client import RedisClient
from notifications_utils.clients.signing.signing_client import Signing
from notifications_utils.clients.statsd.statsd_client import StatsdClient
Expand Down Expand Up @@ -171,6 +172,7 @@ def create_app(application):
init_app(application)

# Metrics intentionally high up to give the most accurate timing and reliability that the metric is recorded
init_otel_app(application)
metrics.init_app(application)
request_helper.init_app(application)
db.init_app(application)
Expand Down
14 changes: 14 additions & 0 deletions app/celery/nightly_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
is_dvla_working_day,
)
from notifications_utils.timezones import convert_utc_to_bst
from opentelemetry import metrics
from sqlalchemy import func
from sqlalchemy.exc import SQLAlchemyError

Expand Down Expand Up @@ -54,6 +55,13 @@
)
from app.utils import get_london_midnight_in_utc

# Module-scoped OpenTelemetry meter; the instrument below is created once at import time.
meter = metrics.get_meter(__name__)

# Counter for notifications that timed out while sending.
otel_timeout_sending_counter = meter.create_counter(
    name="timeout_sending",
    description="Notifications counter that have timed out sending",
)


@notify_celery.task(name="remove_sms_email_jobs")
@cronitor("remove_sms_email_jobs")
Expand Down Expand Up @@ -250,6 +258,12 @@ def timeout_notifications():

for notification in notifications:
statsd_client.incr(f"timeout-sending.{notification.sent_by}")
otel_timeout_sending_counter.add(
1,
{
"notification_send_by": notification.sent_by,
},
)
check_and_queue_callback_task(notification)

current_app.logger.info(
Expand Down
30 changes: 29 additions & 1 deletion app/celery/process_ses_receipts_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import iso8601
from celery.exceptions import Retry
from flask import current_app, json
from notifications_utils.clients.otel.utils import default_histogram_bucket
from opentelemetry import metrics
from sqlalchemy.orm.exc import NoResultFound

from app import notify_celery, statsd_client
Expand All @@ -18,6 +20,20 @@
handle_complaint,
)

# Module-scoped OpenTelemetry meter; the SES instruments below are created once at import time.
meter = metrics.get_meter(__name__)

# Histogram of elapsed time (in seconds) for SES notifications.
otel_ses_notification_processing_histogram = meter.create_histogram(
    name="ses_notification_processing",
    unit="seconds",
    description="Time taken to process an SES notification in seconds",
    explicit_bucket_boundaries_advisory=default_histogram_bucket,
)

# Counter of SES callbacks that were processed.
otel_ses_callback_counter = meter.create_counter(
    name="ses_callback",
    description="Amount of non-duplicate ses callbacks processed",
)


@notify_celery.task(
bind=True, name="process-ses-result", max_retries=5, default_retry_delay=300, early_log_level=logging.DEBUG
Expand Down Expand Up @@ -78,12 +94,24 @@ def process_ses_results(self, response):
)

statsd_client.incr(f"callback.ses.{notification_status}")

otel_ses_callback_counter.add(
1,
{
"notification_status": notification_status,
},
)
if notification.sent_at:
statsd_client.timing_with_dates(
f"callback.ses.{notification_status}.elapsed-time", datetime.utcnow(), notification.sent_at
)

otel_ses_notification_processing_histogram.record(
(datetime.utcnow() - notification.sent_at).total_seconds(),
{
"notification_status": notification_status,
},
)

check_and_queue_callback_task(notification)

return True
Expand Down
31 changes: 31 additions & 0 deletions app/celery/process_sms_client_response_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from datetime import datetime

from flask import current_app
from notifications_utils.clients.otel.utils import default_histogram_bucket
from notifications_utils.template import SMSMessageTemplate
from opentelemetry import metrics

from app import notify_celery, statsd_client
from app.clients import ClientException
Expand All @@ -21,6 +23,20 @@
"Firetext": get_firetext_responses,
}

# Module-scoped OpenTelemetry meter; the SMS instruments below are created once at import time.
meter = metrics.get_meter(__name__)

# Histogram of the time (in seconds) between an SMS being sent and its provider callback.
otel_provider_callback_completed = meter.create_histogram(
    "provider_sms",
    description="Time for sms sends to complete in seconds",
    explicit_bucket_boundaries_advisory=default_histogram_bucket,
    unit="seconds",
)

# Counter of provider callbacks for international SMS only; the description says so
# explicitly so it is not mistaken for a count of every provider callback.
otel_sms_international = meter.create_counter(
    "international_sms",
    description="Count of international SMS provider callbacks",
)


@notify_celery.task(
bind=True, name="process-sms-client-response", max_retries=5, default_retry_delay=300, early_log_level=logging.DEBUG
Expand Down Expand Up @@ -81,6 +97,14 @@ def _process_for_status(notification_status, client_name, provider_reference, de
notification.sent_at,
)

otel_provider_callback_completed.record(
(datetime.utcnow() - notification.sent_at).total_seconds(),
{
"client_name": client_name.lower(),
"notification_status": notification_status,
},
)

if notification.billable_units == 0:
service = notification.service
template_model = dao_get_template_by_id(notification.template_id, notification.template_version)
Expand All @@ -98,3 +122,10 @@ def _process_for_status(notification_status, client_name, provider_reference, de
check_and_queue_callback_task(notification)
if notification.international:
statsd_client.incr(f"international-sms.{notification_status}.{notification.phone_prefix}")
otel_sms_international.add(
1,
{
"notification_status": notification_status,
"phone_prefix": notification.phone_prefix,
},
)
17 changes: 17 additions & 0 deletions app/celery/scheduled_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
NotifyTicketType,
)
from notifications_utils.timezones import convert_utc_to_bst
from opentelemetry import metrics
from redis.exceptions import LockError
from sqlalchemy import and_, between
from sqlalchemy.exc import SQLAlchemyError
Expand Down Expand Up @@ -89,6 +90,13 @@
from app.notifications.process_notifications import persist_notification, send_notification_to_queue
from app.utils import get_london_midnight_in_utc

# Module-scoped OpenTelemetry meter; the gauge below is created once at import time.
meter = metrics.get_meter(__name__)

# Gauge holding the most recently computed slow-delivery ratio.
otel_slow_delivery_ratio_gauge = meter.create_gauge(
    name="slow_delivery_ratio",
    description="Ratio of slow delivery notifications",
)


@notify_celery.task(name="run-scheduled-jobs")
@cronitor("run-scheduled-jobs")
Expand Down Expand Up @@ -204,6 +212,7 @@ def _check_slow_text_message_delivery_reports_and_raise_error_if_needed(reports:
redis_store.set(CacheKeys.NUMBER_OF_TIMES_OVER_SLOW_SMS_DELIVERY_THRESHOLD, 0)


# TODO: We are reimplementing a histogram here. We should just use that instead.
@notify_celery.task(name="generate-sms-delivery-stats")
def generate_sms_delivery_stats():
for delivery_interval in (1, 5, 10):
Expand All @@ -215,11 +224,19 @@ def generate_sms_delivery_stats():
statsd_client.gauge(
f"slow-delivery.{report.provider}.delivered-within-minutes.{delivery_interval}.ratio", report.slow_ratio
)
otel_slow_delivery_ratio_gauge.set(
report.slow_ratio,
attributes={
"provider": report.provider,
"delivery_interval": delivery_interval,
},
)

total_notifications = sum(report.total_notifications for report in providers_slow_delivery_reports)
slow_notifications = sum(report.slow_notifications for report in providers_slow_delivery_reports)
ratio_slow_notifications = slow_notifications / total_notifications

# Not recording an overall otel metric here as this can be calculated
statsd_client.gauge(
f"slow-delivery.sms.delivered-within-minutes.{delivery_interval}.ratio", ratio_slow_notifications
)
Expand Down
4 changes: 4 additions & 0 deletions app/clients/email/aws_ses.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import boto3
import botocore
from flask import current_app
from notifications_utils.clients.otel.utils import otel_duration_histogram

from app.clients import STATISTICS_DELIVERED, STATISTICS_FAILURE
from app.clients.email import (
Expand Down Expand Up @@ -65,6 +66,9 @@ def __init__(self, region, statsd_client):
self._client = boto3.client("sesv2", region_name=region)
self.statsd_client = statsd_client

@otel_duration_histogram(
"aws_ses_send_email_duration", description="Time taken to send an email using AWS SES", unit="seconds"
)
def send_email(
self,
*,
Expand Down
2 changes: 2 additions & 0 deletions app/clients/sms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from time import monotonic

import requests
from notifications_utils.clients.otel.utils import otel_duration_histogram
from urllib3.connection import HTTPConnection

from app.clients import Client, ClientException
Expand Down Expand Up @@ -55,6 +56,7 @@ def record_outcome(self, success):
"Provider request for %s %s", self.name, "succeeded" if success else "failed"
)

@otel_duration_histogram("send_sms", attributes=lambda args, kwargs: {"provider_name": args[0].name})
def send_sms(self, to, content, reference, international, sender):
start_time = monotonic()

Expand Down
8 changes: 8 additions & 0 deletions app/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
from click_datetime import Datetime as click_dt
from dateutil import rrule
from flask import current_app, json
from notifications_utils.clients.otel.utils import otel_duration_histogram, otel_span_with_status
from notifications_utils.recipients import RecipientCSV
from notifications_utils.statsd_decorators import statsd
from notifications_utils.template import SMSMessageTemplate
from opentelemetry.trace import get_tracer
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm.exc import NoResultFound

Expand Down Expand Up @@ -397,6 +399,8 @@ def bulk_invite_user_to_service(file_name, service_id, user_id, auth_type, permi
"-s", "--start_date", default=datetime(2017, 2, 1), help="start date inclusive", type=click_dt(format="%Y-%m-%d")
)
@statsd(namespace="tasks")
@otel_duration_histogram("populate_notification_postage_duration")
@otel_span_with_status(get_tracer(__name__), "populate_notification_postage")
def populate_notification_postage(start_date):
current_app.logger.info("populating historical notification postage")

Expand Down Expand Up @@ -442,6 +446,8 @@ def populate_notification_postage(start_date):
@click.option("-s", "--start_date", required=True, help="start date inclusive", type=click_dt(format="%Y-%m-%d"))
@click.option("-e", "--end_date", required=True, help="end date inclusive", type=click_dt(format="%Y-%m-%d"))
@statsd(namespace="tasks")
@otel_duration_histogram("update_jobs_archived_flag")
@otel_span_with_status(get_tracer(__name__), "populate_notification_postage")
def update_jobs_archived_flag(start_date, end_date):
current_app.logger.info("Archiving jobs created between %s to %s", start_date, end_date)

Expand Down Expand Up @@ -475,6 +481,8 @@ def update_jobs_archived_flag(start_date, end_date):
@notify_command(name="update-emails-to-remove-gsi")
@click.option("-s", "--service_id", required=True, help="service id. Update all user.email_address to remove .gsi")
@statsd(namespace="tasks")
@otel_duration_histogram("update_emails_to_remove_gsi")
@otel_span_with_status(get_tracer(__name__), "populate_notification_postage")
def update_emails_to_remove_gsi(service_id):
users_to_update = """SELECT u.id user_id, u.name, email_address, s.id, s.name
FROM users u
Expand Down
5 changes: 4 additions & 1 deletion app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from celery.schedules import crontab
from kombu import Exchange, Queue
from notifications_utils.config import BaseConfig


class QueueNames:
Expand Down Expand Up @@ -88,7 +89,7 @@ class TaskNames:
RECREATE_PDF_FOR_PRECOMPILED_LETTER = "recreate-pdf-for-precompiled-letter"


class Config:
class Config(BaseConfig):
# URL of admin app
ADMIN_BASE_URL = os.getenv("ADMIN_BASE_URL", "http://localhost:6012")

Expand All @@ -100,6 +101,8 @@ class Config:
CELERY_WORKER_LOG_LEVEL = os.getenv("CELERY_WORKER_LOG_LEVEL", "CRITICAL").upper()
CELERY_BEAT_LOG_LEVEL = os.getenv("CELERY_BEAT_LOG_LEVEL", "INFO").upper()

OTEL_EXPORT_TYPE = os.environ.get("OTEL_EXPORT_TYPE", "otlp").lower().strip()

# secrets that internal apps, such as the admin app or document download, must use to authenticate with the API
ADMIN_CLIENT_ID = "notify-admin"
FUNCTIONAL_TESTS_CLIENT_ID = "notify-functional-tests"
Expand Down
3 changes: 2 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ psutil>=6.0.0,<7.0.0
notifications-python-client==10.0.1

# Run `make bump-utils` to update to the latest version
notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@99.8.0
#notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@99.8.0
notifications-utils[otel-instrumentation] @ git+https://github.com/alphagov/notifications-utils.git@c59a85f44ad83ee0d4e4f09499b5b58b5239cf2c

# gds-metrics requires prometheus-client 0.2.0, override that requirement as 0.7.1 brings significant performance gains
prometheus-client==0.14.1
Expand Down
Loading