diff --git a/docs/infrastructure/metrics-and-logging.md b/docs/infrastructure/metrics-and-logging.md
new file mode 100644
index 000000000..7e9df7983
--- /dev/null
+++ b/docs/infrastructure/metrics-and-logging.md
@@ -0,0 +1,49 @@
+# Metrics and logging
+
+Some processes were set up for the `notifications` code to observe logs, events and metrics.
+
+## Application Insights
+
+The `ApplicationInsightsLogging` service class sets up logs to feed into Application Insights, and can then be called like so:
+
+```python
+class CommandHandler:
+    @staticmethod
+    @contextmanager
+    def handle(command_name):
+        try:
+            yield
+            ApplicationInsightsLogging().custom_event_info(
+                event_name=f"{command_name}Completed",
+                message=f"{command_name} completed successfully",
+            )
+        except Exception as e:
+            ApplicationInsightsLogging().exception(f"{command_name}Error: {e}")
+            raise CommandError(e)
+```
+
+The Application Insights resource is set up by the `app_insights_audit` module in Terraform:
+
+```terraform
+module "app_insights_audit" {
+  source = "../dtos-devops-templates/infrastructure/modules/app-insights"
+
+  name                = module.shared_config.names.app-insights
+  location            = var.region
+  resource_group_name = azurerm_resource_group.main.name
+  appinsights_type    = "web"
+
+  log_analytics_workspace_id = module.log_analytics_workspace_audit.id
+
+  # alerts
+  action_group_id = var.action_group_id
+  enable_alerting = var.enable_alerting
+}
+```
+
+## Metrics
+
+This PR introduced a `Metrics` service class and a `collect_metrics` command to collect information about queue sizes.
+
+
+The metrics would feed through to the Application Insights resource and could be viewed in the Azure Portal.
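+
+A condensed sketch of the `collect_metrics` command is shown below. It records the approximate message count of each storage queue as a gauge metric; logging setup is omitted and the queue list is inlined here for brevity:
+
+```python
+from django.core.management.base import BaseCommand, CommandError
+
+from manage_breast_screening.notifications.services.metrics import Metrics
+from manage_breast_screening.notifications.services.queue import Queue
+
+
+class Command(BaseCommand):
+    def handle(self, *args, **options):
+        try:
+            # One entry per queue to observe; currently only the message batch
+            # retry queue is measured.
+            queues = [Queue.RetryMessageBatches()]
+
+            for queue in queues:
+                # Record the approximate queue length as a gauge named
+                # "queue_size_<queue name>", exported as a custom metric.
+                Metrics().set_gauge_value(
+                    f"queue_size_{queue.queue_name}",
+                    "messages",
+                    "Queue length",
+                    queue.get_message_count(),
+                )
+        except Exception as e:
+            raise CommandError(e)
+```
+
+The `Metrics` service is a singleton wrapping an OpenTelemetry gauge backed by the Azure Monitor metric exporter, and it tags every recorded value with an `environment` dimension so the metrics can be filtered per environment in Application Insights.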
diff --git a/infrastructure/modules/container-apps/alerts.tf b/infrastructure/modules/container-apps/alerts.tf
index f1e02d55a..a9ea412bd 100644
--- a/infrastructure/modules/container-apps/alerts.tf
+++ b/infrastructure/modules/container-apps/alerts.tf
@@ -36,55 +36,3 @@ resource "azurerm_monitor_scheduled_query_rules_alert_v2" "failure_event" {
     }
   }
 }
-
-# IMPORTANT:
-# Enable metrics store with all dimensions: https://docs.azure.cn/en-us/azure-monitor/app/metrics-overview?tabs=standard#custom-metrics-dimensions-and-preaggregation
-# currently this feature is in preview.
-resource "azurerm_monitor_scheduled_query_rules_alert_v2" "queue_length_high" {
-  for_each = var.enable_alerting ? toset([
-    "notifications-message-batch-retries"
-  ]) : []
-
-  name                = "${var.app_short_name}-${each.key}-${var.environment}-queue-length-high-alert"
-  location            = var.region
-  resource_group_name = azurerm_resource_group.main.name
-
-  auto_mitigation_enabled = true
-  description             = "Alert when queue length exceeds ${var.queue_length_alert_threshold}"
-  display_name            = "${var.app_short_name} Notifications Queue Length High Alert"
-  enabled                 = true
-  severity                = 2
-  evaluation_frequency    = "PT10M"
-  window_duration         = "PT10M"
-  scopes                  = [var.app_insights_id]
-
-  criteria {
-    query = <<-KQL
-      customMetrics
-      | where name == "${each.key}"
-      | extend environment = tostring(customDimensions.environment)
-      | where environment == "${var.environment}"
-      | extend value = toreal(value)
-      | summarize avg_value = avg(value) by bin(timestamp, 5m)
-      | where avg_value > ${var.queue_length_alert_threshold}
-    KQL
-
-    metric_measure_column   = "avg_value"
-    time_aggregation_method = "Average"
-    operator                = "GreaterThan"
-    threshold               = 0
-
-    failing_periods {
-      minimum_failing_periods_to_trigger_alert = 1
-      number_of_evaluation_periods             = 1
-    }
-  }
-
-  action {
-    action_groups = [var.action_group_id]
-  }
-
-  tags = {
-    environment = var.environment
-  }
-}
diff --git a/infrastructure/modules/container-apps/jobs.tf b/infrastructure/modules/container-apps/jobs.tf
index 467a72759..d462f45c5 100644
--- a/infrastructure/modules/container-apps/jobs.tf
+++ b/infrastructure/modules/container-apps/jobs.tf
@@ -33,15 +33,6 @@ locals {
       job_short_name     = "smk"
       job_container_args = "create_reports --smoke-test"
     }
-    collect_metrics = {
-      cron_expression = "*/5 * * * *"
-      environment_variables = {
-        RETRY_QUEUE_NAME = "notifications-message-batch-retries"
-        ENVIRONMENT      = var.environment
-      }
-      job_short_name     = "clm"
-      job_container_args = "collect_metrics"
-    }
   }
 }
 
diff --git a/infrastructure/modules/container-apps/storage.tf b/infrastructure/modules/container-apps/storage.tf
index 22471875a..a67b8eafe 100644
--- a/infrastructure/modules/container-apps/storage.tf
+++ b/infrastructure/modules/container-apps/storage.tf
@@ -33,7 +33,6 @@ module "storage" {
     private_endpoint_resource_group_name = azurerm_resource_group.main.name
     private_service_connection_is_manual = false
   }
-  queues              = local.storage_queues
   resource_group_name = azurerm_resource_group.main.name
 }
 
diff --git a/infrastructure/modules/container-apps/variables.tf b/infrastructure/modules/container-apps/variables.tf
index 9fc4eca9e..2bac497dc 100644
--- a/infrastructure/modules/container-apps/variables.tf
+++ b/infrastructure/modules/container-apps/variables.tf
@@ -199,10 +199,6 @@ variable "app_insights_id" {
   type        = string
 }
 
-variable "queue_length_alert_threshold" {
-  description = "If alerting is enabled, alert if storage account queues are greater than this threshold."
-  type        = number
-}
 
 variable "enable_notifications_jobs_schedule" {
   description = "Whether we apply the cron schedules for the notifications container app jobs"
@@ -259,7 +255,6 @@ locals {
      container_access_type = "private"
    }
  }
-  storage_queues       = ["notifications-message-batch-retries"]
  always_allowed_paths = ["/sha", "/healthcheck"]
 
  # If allowed_paths is not set, use the module default which allows any pattern
diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf
index bb8d88b35..e03a3a06b 100644
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@@ -79,6 +79,5 @@ module "container-apps" {
   target_url                         = var.deploy_container_apps ? "${module.container-apps[0].external_url}healthcheck" : null
   resource_group_name_infra          = local.resource_group_name
   enable_notifications_jobs_schedule = var.enable_notifications_jobs_schedule
-  queue_length_alert_threshold       = var.queue_length_alert_threshold
   min_replicas                       = var.min_replicas
 }
diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf
index e25c559d0..6ded9bef6 100644
--- a/infrastructure/terraform/variables.tf
+++ b/infrastructure/terraform/variables.tf
@@ -185,12 +185,6 @@ variable "run_notifications_smoke_test" {
   type        = bool
 }
 
-variable "queue_length_alert_threshold" {
-  description = "If alerting is enabled, alert if storage account queues are greater than this threshold."
-  type        = number
-  default     = 5
-}
-
 locals {
   region = "uksouth"
 
diff --git a/manage_breast_screening/config/.env.tpl b/manage_breast_screening/config/.env.tpl
index fc476ed6f..e00cfd531 100644
--- a/manage_breast_screening/config/.env.tpl
+++ b/manage_breast_screening/config/.env.tpl
@@ -29,8 +29,6 @@ BASIC_AUTH_USERNAME=changeme
 BASIC_AUTH_PASSWORD=changeme
 
 # Notifications specific env vars
-NOTIFICATIONS_BATCH_RETRY_LIMIT=5
-
 API_OAUTH_API_KEY=""
 API_OAUTH_API_KID=""
 API_OAUTH_PRIVATE_KEY=""
@@ -41,9 +39,7 @@ NBSS_MESH_INBOX_NAME="paste-mesh-inbox-name-here"
 NBSS_MESH_PASSWORD="paste-mesh-password-here"
 NBSS_MESH_CERT="paste-pem-mesh-cert-here"
 NBSS_MESH_PRIVATE_KEY="paste-pem-private-key-here"
-QUEUE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;"
 REPORTS_CONTAINER_NAME="notifications-reports"
-RETRY_QUEUE_NAME="notifications-message-batch-retries"
 
 APPLICATIONINSIGHTS_CONNECTION_STRING=""
 APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL=True
diff --git a/manage_breast_screening/notifications/README.md b/manage_breast_screening/notifications/README.md
index da4644887..34d8a62e7 100644
--- a/manage_breast_screening/notifications/README.md
+++ b/manage_breast_screening/notifications/README.md
@@ -8,7 +8,7 @@ The application is composed of a set of Django Admin commands which are run on a
 The commands currently process a feed of data from the active NBSS system.
 The commands store and then send appointment notifications via NHS Notify.
 
-Storage is handled via Azure blob containers, Azure storage queues and Postgresql.
+Storage is handled via Azure blob containers and Postgresql.
 
 Appointment notifications are sent 4 weeks prior to the appointment date.
 Any appointment data processed within 4 weeks of the appointment date will also be eligible for notification on the next scheduled batch.
diff --git a/manage_breast_screening/notifications/management/commands/collect_metrics.py b/manage_breast_screening/notifications/management/commands/collect_metrics.py
deleted file mode 100644
index c4f55778d..000000000
--- a/manage_breast_screening/notifications/management/commands/collect_metrics.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import logging
-
-from django.core.management.base import BaseCommand, CommandError
-
-from manage_breast_screening.notifications.services.metrics import Metrics
-from manage_breast_screening.notifications.services.queue import Queue
-
-logger = logging.getLogger(__name__)
-
-
-class Command(BaseCommand):
-    def handle(self, *args, **options):
-        try:
-            queues = [
-                Queue.RetryMessageBatches(),
-            ]
-
-            for queue in queues:
-                Metrics().set_gauge_value(
-                    f"queue_size_{queue.queue_name}",
-                    "messages",
-                    "Queue length",
-                    queue.get_message_count(),
-                )
-
-        except Exception as e:
-            logger.error(e, exc_info=True)
-            raise CommandError(e)
diff --git a/manage_breast_screening/notifications/services/metrics.py b/manage_breast_screening/notifications/services/metrics.py
deleted file mode 100644
index 1f075abe0..000000000
--- a/manage_breast_screening/notifications/services/metrics.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import logging
-import os
-
-from azure.monitor.opentelemetry.exporter import AzureMonitorMetricExporter
-from opentelemetry import metrics
-from opentelemetry.sdk.metrics import MeterProvider
-from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
-
-logger = logging.getLogger(__name__)
-
-
-class Metrics:
-    _instance = None
-
-    def __new__(cls, *args, **kwargs):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-
-    def __init__(self):
-        environment = os.getenv("ENVIRONMENT")
-        logger.debug((f"Initialising Metrics(environment: {environment})"))
-
-        exporter = AzureMonitorMetricExporter(
-            connection_string=os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
-        )
-        metrics.set_meter_provider(
-            MeterProvider(metric_readers=[PeriodicExportingMetricReader(exporter)])
-        )
-        self.meter = metrics.get_meter(__name__)
-        self.environment = environment
-
-    def set_gauge_value(self, metric_name, units, description, value):
-        logger.debug(
-            (
-                f"Metrics: set_gauge_value(metric_name: {metric_name} "
-                f"units: {units}, description: {description}, value: {value})"
-            )
-        )
-
-        # Create gauge metric
-        gauge = self.meter.create_gauge(
-            metric_name, unit=units, description=description
-        )
-
-        # Set metric value
-        gauge.set(
-            value,
-            {"environment": self.environment},
-        )
diff --git a/manage_breast_screening/notifications/services/queue.py b/manage_breast_screening/notifications/services/queue.py
deleted file mode 100644
index dbb042041..000000000
--- a/manage_breast_screening/notifications/services/queue.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import logging
-import os
-
-from azure.core.exceptions import ResourceExistsError
-from azure.identity import ManagedIdentityCredential
-from azure.storage.queue import QueueClient, QueueMessage
-
-logger = logging.getLogger(__name__)
-
-
-class QueueConfigurationError(Exception):
-    """Raised when queue is not properly configured"""
-
-    pass
-
-
-class Queue:
-    def __init__(self, queue_name):
-        storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
-        queue_mi_client_id = os.getenv("QUEUE_MI_CLIENT_ID")
-        connection_string = os.getenv("QUEUE_STORAGE_CONNECTION_STRING")
-        self.queue_name = queue_name
-
-        if storage_account_name and queue_mi_client_id:
-            self.client = QueueClient(
-                f"https://{storage_account_name}.queue.core.windows.net",
-                queue_name=queue_name,
-                credential=ManagedIdentityCredential(client_id=queue_mi_client_id),
-            )
-
-        elif connection_string:
-            try:
-                self.client = QueueClient.from_connection_string(
-                    connection_string, queue_name
-                )
-                self.client.create_queue()
-            except ResourceExistsError:
-                pass
-        else:
-            raise QueueConfigurationError(
-                "Queue not configured: Either QUEUE_STORAGE_CONNECTION_STRING or "
-                "(STORAGE_ACCOUNT_NAME and QUEUE_MI_CLIENT_ID) must be set"
-            )
-
-    def add(self, message: str):
-        if not hasattr(self, "client") or self.client is None:
-            raise QueueConfigurationError("Queue client not initialized")
-        self.client.send_message(message)
-
-    def delete(self, message: str | QueueMessage):
-        self.client.delete_message(message)
-
-    def items(self, limit=50):
-        return self.client.receive_messages(max_messages=limit)
-
-    def peek(self):
-        return self.client.peek_messages()
-
-    def item(self):
-        return self.client.receive_message()
-
-    def get_message_count(self):
-        return self.client.get_queue_properties().approximate_message_count
-
-    @classmethod
-    def RetryMessageBatches(cls):
-        return cls(os.getenv("RETRY_QUEUE_NAME", "notifications-message-batch-retries"))
diff --git a/manage_breast_screening/notifications/tests/end_to_end/test_end_to_end.py b/manage_breast_screening/notifications/tests/end_to_end/test_end_to_end.py
index cbf71df64..ac91fd0ba 100644
--- a/manage_breast_screening/notifications/tests/end_to_end/test_end_to_end.py
+++ b/manage_breast_screening/notifications/tests/end_to_end/test_end_to_end.py
@@ -31,7 +31,6 @@ def setup_environment(self):
         self.env.set("NBSS_MESH_PRIVATE_KEY", "mesh-private-key")
         self.env.set("NBSS_MESH_CA_CERT", "mesh-ca-cert")
         self.env.set("BLOB_STORAGE_CONNECTION_STRING", connection_string)
-        self.env.set("QUEUE_STORAGE_CONNECTION_STRING", connection_string)
         self.env.set("BLOB_CONTAINER_NAME", "nbss-appoinments-data")
 
         Helpers().add_file_to_mesh_mailbox(
diff --git a/manage_breast_screening/notifications/tests/integration/helpers.py b/manage_breast_screening/notifications/tests/integration/helpers.py
index 0b49eb9cc..ff9ef3b3e 100644
--- a/manage_breast_screening/notifications/tests/integration/helpers.py
+++ b/manage_breast_screening/notifications/tests/integration/helpers.py
@@ -1,11 +1,7 @@
 import os
-import time
-from contextlib import contextmanager
 
 from mesh_client import MeshClient
 
-from manage_breast_screening.notifications.services.queue import Queue
-
 
 class Helpers:
     def add_file_to_mesh_mailbox(self, filepath: str):
@@ -36,17 +32,3 @@ def azurite_connection_string(self):
             "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
             "QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;"
         )
-
-    @contextmanager
-    def queue_listener(self, queue: Queue, command: callable, delay=1.0):
-        def queue_count(q):
-            return sum(1 for i in q.peek())
-
-        count = queue_count(queue)
-
-        yield
-
-        while count == queue_count(queue):
-            time.sleep(delay)
-
-        command()
diff --git a/manage_breast_screening/notifications/tests/management/commands/test_collect_metrics.py b/manage_breast_screening/notifications/tests/management/commands/test_collect_metrics.py
deleted file mode 100644
index 02dc17196..000000000
--- a/manage_breast_screening/notifications/tests/management/commands/test_collect_metrics.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from manage_breast_screening.notifications.management.commands.collect_metrics import (
-    Command,
-)
-
-
-class TestCollectMetrics:
-    @pytest.fixture(autouse=True)
-    def setup(self, monkeypatch):
-        monkeypatch.setenv("ENVIRONMENT", "test")
-
-    @patch(f"{Command.__module__}.Queue")
-    @patch(f"{Command.__module__}.Metrics")
-    def test_handle_sends_queue_lengths(self, mock_metrics_class, mock_queue):
-        mock_retry = MagicMock()
-        mock_retry.queue_name = "retry_queue"
-        mock_retry.get_message_count.return_value = 5
-
-        mock_queue.RetryMessageBatches.return_value = mock_retry
-
-        Command().handle()
-
-        metrics_instance = mock_metrics_class.return_value
-
-        assert metrics_instance.set_gauge_value.call_count == 1
-        metrics_instance.set_gauge_value.assert_any_call(
-            "queue_size_retry_queue",
-            "messages",
-            "Queue length",
-            5,
-        )
diff --git a/manage_breast_screening/notifications/tests/services/test_metrics.py b/manage_breast_screening/notifications/tests/services/test_metrics.py
deleted file mode 100644
index b6d033163..000000000
--- a/manage_breast_screening/notifications/tests/services/test_metrics.py
+++ /dev/null
@@ -1,92 +0,0 @@
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from manage_breast_screening.notifications.services.metrics import Metrics
-
-
-@patch(f"{Metrics.__module__}.AzureMonitorMetricExporter")
-@patch(f"{Metrics.__module__}.PeriodicExportingMetricReader")
-@patch(f"{Metrics.__module__}.metrics")
-@patch(f"{Metrics.__module__}.MeterProvider")
-class TestMetrics:
-    @pytest.fixture
-    def conn_string(self):
-        return "InstrumentationKey=00000000-0000-0000-0000-000000000000"
-
-    @pytest.fixture(autouse=True)
-    def setup(self, monkeypatch, conn_string):
-        monkeypatch.setenv("APPLICATIONINSIGHTS_CONNECTION_STRING", conn_string)
-        monkeypatch.setenv("ENVIRONMENT", "dev")
-
-    def test_metrics_initialisation(
-        self,
-        mock_meter_provider,
-        mock_metrics,
-        mock_metric_reader,
-        mock_metric_exporter,
-        conn_string,
-    ):
-        mock_meter = MagicMock()
-        mock_metrics.get_meter.return_value = mock_meter
-
-        subject = Metrics()
-
-        mock_metric_exporter.assert_called_once_with(connection_string=str(conn_string))
-        mock_metric_reader.assert_called_once_with(mock_metric_exporter.return_value)
-        mock_meter_provider.assert_called_once_with(
-            metric_readers=[mock_metric_reader.return_value]
-        )
-        mock_metrics.set_meter_provider.assert_called_once_with(
-            mock_meter_provider.return_value
-        )
-        mock_metrics.get_meter.assert_called_once_with(
-            "manage_breast_screening.notifications.services.metrics"
-        )
-
-        assert subject.meter == mock_meter
-        assert subject.environment == "dev"
-
-    def test_metrics_is_a_singleton(
-        self,
-        mock_meter_provider,
-        mock_metrics,
-        mock_reader,
-        mock_exporter,
-    ):
-        subject = Metrics()
-        the_same_instance = Metrics()
-        assert subject == the_same_instance
-
-    def test_set_gauge_value(
-        self,
-        mock_meter_provider,
-        mock_metrics,
-        mock_reader,
-        mock_exporter,
-    ):
-        mock_meter = MagicMock()
-        mock_gauge = MagicMock()
-
-        mock_metrics.get_meter.return_value = mock_meter
-        mock_meter.create_gauge.return_value = mock_gauge
-
-        subject = Metrics()
-
-        subject.set_gauge_value(
-            metric_name="queue_depth",
-            units="messages",
-            description="Number of messages",
-            value=999,
-        )
-
-        mock_meter.create_gauge.assert_called_once_with(
-            "queue_depth",
-            unit="messages",
-            description="Number of messages",
-        )
-
-        mock_gauge.set.assert_called_once_with(
-            999,
-            {"environment": "dev"},
-        )
diff --git a/manage_breast_screening/notifications/tests/services/test_queue.py b/manage_breast_screening/notifications/tests/services/test_queue.py
deleted file mode 100644
index 3dd69ee81..000000000
--- a/manage_breast_screening/notifications/tests/services/test_queue.py
+++ /dev/null
@@ -1,138 +0,0 @@
-from unittest.mock import MagicMock, PropertyMock, patch
-
-import pytest
-
-from manage_breast_screening.notifications.services.queue import (
-    Queue,
-    QueueConfigurationError,
-)
-
-
-class TestQueue:
-    @pytest.fixture(autouse=True)
-    def setup(self, monkeypatch):
-        monkeypatch.setenv("QUEUE_STORAGE_CONNECTION_STRING", "qqq111")
-
-    @pytest.fixture
-    def mock_queue_client(self):
-        with patch(
-            "manage_breast_screening.notifications.services.queue.QueueClient"
-        ) as queue_client:
-            mock_client = MagicMock()
-            queue_client.from_connection_string.return_value = mock_client
-            yield mock_client
-
-    def test_queue_is_created(self):
-        with patch(
-            "manage_breast_screening.notifications.services.queue.QueueClient"
-        ) as queue_client:
-            mock_client = MagicMock()
-            queue_client.from_connection_string.return_value = mock_client
-
-            Queue("new-queue")
-
-            queue_client.from_connection_string.assert_called_once_with(
-                "qqq111", "new-queue"
-            )
-            mock_client.create_queue.assert_called_once()
-
-    def test_add_to_queue(self, mock_queue_client):
-        Queue("new-queue").add("a message")
-
-        mock_queue_client.send_message.assert_called_once_with("a message")
-
-    def test_items_method_receives_messages(self, mock_queue_client):
-        mock_queue_client.receive_messages.return_value = ["this", "that"]
-
-        assert Queue("new-queue").items(limit=100) == ["this", "that"]
-
-        mock_queue_client.receive_messages.assert_called_once_with(max_messages=100)
-
-    def test_item_method_receives_messages(self, mock_queue_client):
-        mock_queue_client.receive_message.return_value = ["this"]
-
-        assert Queue("new-queue").item() == ["this"]
-
-        mock_queue_client.receive_message.assert_called_once()
-
-    def test_delete_deletes_message(self, mock_queue_client):
-        Queue("new-queue").delete("this message")
-
-        mock_queue_client.delete_message.assert_called_once_with("this message")
-
-    def test_queue_initialises_using_managed_identity_credentials(self, monkeypatch):
-        monkeypatch.setenv("STORAGE_ACCOUNT_NAME", "mystorageaccount")
-        monkeypatch.setenv("QUEUE_MI_CLIENT_ID", "my-mi-id")
-        mock_mi_cred = MagicMock()
-
-        with patch(
-            "manage_breast_screening.notifications.services.queue.QueueClient"
-        ) as queue_client:
-            with patch(
-                "manage_breast_screening.notifications.services.queue.ManagedIdentityCredential"
-            ) as managed_identity_constructor:
-                managed_identity_constructor.return_value = mock_mi_cred
-
-                Queue("new-queue")
-
-                queue_client.assert_called_once_with(
-                    "https://mystorageaccount.queue.core.windows.net",
-                    queue_name="new-queue",
-                    credential=mock_mi_cred,
-                )
-                managed_identity_constructor.assert_called_once_with(
-                    client_id="my-mi-id"
-                )
-
-    def test_retry_message_batches_queue(self):
-        with patch(
-            "manage_breast_screening.notifications.services.queue.QueueClient"
-        ) as queue_client:
-            mock_client = MagicMock()
-            queue_client.from_connection_string.return_value = mock_client
-
-            Queue.RetryMessageBatches().add("some data")
-
-            queue_client.from_connection_string.assert_called_once_with(
-                "qqq111", "notifications-message-batch-retries"
-            )
-            mock_client.create_queue.assert_called_once()
-            mock_client.send_message.assert_called_once_with("some data")
-
-    def test_retry_queue_prefers_queue_name_from_env(self, monkeypatch):
-        monkeypatch.setenv("RETRY_QUEUE_NAME", "retries")
-        with patch(
-            "manage_breast_screening.notifications.services.queue.QueueClient"
-        ) as queue_client:
-            mock_client = MagicMock()
-            queue_client.from_connection_string.return_value = mock_client
-
-            Queue.RetryMessageBatches()
-
-            queue_client.from_connection_string.assert_called_once_with(
-                "qqq111", "retries"
-            )
-
-    def test_queue_raises_configuration_error_when_no_config_provided(
-        self, monkeypatch
-    ):
-        monkeypatch.delenv("QUEUE_STORAGE_CONNECTION_STRING", raising=False)
-        monkeypatch.delenv("STORAGE_ACCOUNT_NAME", raising=False)
-        monkeypatch.delenv("QUEUE_MI_CLIENT_ID", raising=False)
-
-        with pytest.raises(QueueConfigurationError) as exc_info:
-            Queue("test-queue")
-
-        assert "Queue not configured" in str(exc_info.value)
-
-    def test_get_message_count_returns_correct_value(self, mock_queue_client, caplog):
-        with patch(
-            "manage_breast_screening.notifications.services.queue.QueueClient"
-        ) as queue_client:
-            queue_client.get_queue_properties.return_value = PropertyMock(
-                approximate_message_count=666
-            )
-            subject = Queue("A-Q")
-            subject.client = queue_client
-
-            assert subject.get_message_count() == 666