Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions docs/infrastructure/metrics-and-logging.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Metrics and logging

Some processes were set up for the `notifications` code to observe logs, events and metrics.

## Application Insights

The `ApplicationInsightsLogging` service class sets up logs to feed into Application Insights. It can then be used like so:

```python
class CommandHandler:
    """Report the outcome of a command to Application Insights."""

    # ``staticmethod`` has to be the outermost decorator: ``contextmanager``
    # must wrap the plain generator function, and the resulting factory is
    # then exposed on the class without any instance binding.
    @staticmethod
    @contextmanager
    def handle(command_name):
        """Wrap a ``with`` body and log whether it completed or failed.

        Args:
            command_name: Name used to build the event name and log messages.

        Raises:
            CommandError: When the wrapped body (or the success logging call)
                raises any ``Exception``. The original exception is attached
                as the cause.
        """
        try:
            yield
            # Only reached when the ``with`` body finished without raising.
            ApplicationInsightsLogging().custom_event_info(
                event_name=f"{command_name}Completed",
                message=f"{command_name} completed successfully",
            )
        except Exception as e:
            ApplicationInsightsLogging().exception(f"{command_name}Error: {e}")
            raise CommandError(e) from e
```

The Application Insights resource is set up by the `app_insights_audit` module in Terraform:

```terraform
# Creates the Application Insights resource by calling the shared `app-insights`
# module from the dtos-devops-templates checkout.
module "app_insights_audit" {
source = "../dtos-devops-templates/infrastructure/modules/app-insights"

name = module.shared_config.names.app-insights
location = var.region
resource_group_name = azurerm_resource_group.main.name
appinsights_type = "web"

# ID of the audit Log Analytics workspace module, passed through to the app-insights module.
log_analytics_workspace_id = module.log_analytics_workspace_audit.id

# Alerting inputs, forwarded unchanged from this configuration's variables.
action_group_id = var.action_group_id
enable_alerting = var.enable_alerting
}
```

## Metrics

The following PR introduced a `Metrics` service class and a `collect_metrics` command to collect information about queue sizes:
<https://github.com/NHSDigital/dtos-manage-breast-screening/pull/617>

The metrics would feed through to the Application Insights resource and could be viewed in the Portal.
52 changes: 0 additions & 52 deletions infrastructure/modules/container-apps/alerts.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,55 +36,3 @@ resource "azurerm_monitor_scheduled_query_rules_alert_v2" "failure_event" {
}
}
}

# IMPORTANT:
# Enable metrics store with all dimensions: https://docs.azure.cn/en-us/azure-monitor/app/metrics-overview?tabs=standard#custom-metrics-dimensions-and-preaggregation
# currently this feature is in preview.
# Scheduled log-query alert on the custom queue-length metric in Application Insights.
# One rule is created per queue name in the set below, and none at all unless
# var.enable_alerting is true.
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "queue_length_high" {
for_each = var.enable_alerting ? toset([
"notifications-message-batch-retries"
]) : []

name = "${var.app_short_name}-${each.key}-${var.environment}-queue-length-high-alert"
location = var.region
resource_group_name = azurerm_resource_group.main.name

auto_mitigation_enabled = true
description = "Alert when queue length exceeds ${var.queue_length_alert_threshold}"
display_name = "${var.app_short_name} Notifications Queue Length High Alert"
enabled = true
severity = 2
# Evaluated every 10 minutes over a 10-minute window.
evaluation_frequency = "PT10M"
window_duration = "PT10M"
scopes = [var.app_insights_id]

criteria {
# The query selects customMetrics rows named after the queue (each.key) for this
# environment, averages them in 5-minute bins, and keeps only the bins whose
# average is above var.queue_length_alert_threshold.
query = <<-KQL
customMetrics
| where name == "${each.key}"
| extend environment = tostring(customDimensions.environment)
| where environment == "${var.environment}"
| extend value = toreal(value)
| summarize avg_value = avg(value) by bin(timestamp, 5m)
| where avg_value > ${var.queue_length_alert_threshold}
KQL

# The real threshold is applied inside the query above, so the comparison here is
# against 0: any surviving row triggers the alert.
# NOTE(review): this relies on the configured threshold being non-negative - confirm.
metric_measure_column = "avg_value"
time_aggregation_method = "Average"
operator = "GreaterThan"
threshold = 0

# A single failing evaluation period is enough to raise the alert.
failing_periods {
minimum_failing_periods_to_trigger_alert = 1
number_of_evaluation_periods = 1
}
}

action {
action_groups = [var.action_group_id]
}

tags = {
environment = var.environment
}
}
9 changes: 0 additions & 9 deletions infrastructure/modules/container-apps/jobs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,6 @@ locals {
job_short_name = "smk"
job_container_args = "create_reports --smoke-test"
}
collect_metrics = {
cron_expression = "*/5 * * * *"
environment_variables = {
RETRY_QUEUE_NAME = "notifications-message-batch-retries"
ENVIRONMENT = var.environment
}
job_short_name = "clm"
job_container_args = "collect_metrics"
}
}
}

Expand Down
1 change: 0 additions & 1 deletion infrastructure/modules/container-apps/storage.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ module "storage" {
private_endpoint_resource_group_name = azurerm_resource_group.main.name
private_service_connection_is_manual = false
}
queues = local.storage_queues
resource_group_name = azurerm_resource_group.main.name
}

Expand Down
5 changes: 0 additions & 5 deletions infrastructure/modules/container-apps/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,6 @@ variable "app_insights_id" {
type = string
}

# Threshold interpolated into the description and KQL query of the
# queue_length_high alert rule. No default is declared here, so the caller of
# this module has to supply a value.
variable "queue_length_alert_threshold" {
description = "If alerting is enabled, alert if storage account queues are greater than this threshold."
type = number
}

variable "enable_notifications_jobs_schedule" {
description = "Whether we apply the cron schedules for the notifications container app jobs"
Expand Down Expand Up @@ -259,7 +255,6 @@ locals {
container_access_type = "private"
}
}
storage_queues = ["notifications-message-batch-retries"]

always_allowed_paths = ["/sha", "/healthcheck"]
# If allowed_paths is not set, use the module default which allows any pattern
Expand Down
1 change: 0 additions & 1 deletion infrastructure/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,5 @@ module "container-apps" {
target_url = var.deploy_container_apps ? "${module.container-apps[0].external_url}healthcheck" : null
resource_group_name_infra = local.resource_group_name
enable_notifications_jobs_schedule = var.enable_notifications_jobs_schedule
queue_length_alert_threshold = var.queue_length_alert_threshold
min_replicas = var.min_replicas
}
6 changes: 0 additions & 6 deletions infrastructure/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -185,12 +185,6 @@ variable "run_notifications_smoke_test" {
type = bool
}

# Root-level queue-length alert threshold; passed to the container-apps module
# as its queue_length_alert_threshold input. Defaults to 5 when not overridden.
variable "queue_length_alert_threshold" {
description = "If alerting is enabled, alert if storage account queues are greater than this threshold."
type = number
default = 5
}

locals {
region = "uksouth"

Expand Down
4 changes: 0 additions & 4 deletions manage_breast_screening/config/.env.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ BASIC_AUTH_USERNAME=changeme
BASIC_AUTH_PASSWORD=changeme

# Notifications specific env vars
NOTIFICATIONS_BATCH_RETRY_LIMIT=5

API_OAUTH_API_KEY=""
API_OAUTH_API_KID=""
API_OAUTH_PRIVATE_KEY=""
Expand All @@ -41,9 +39,7 @@ NBSS_MESH_INBOX_NAME="paste-mesh-inbox-name-here"
NBSS_MESH_PASSWORD="paste-mesh-password-here"
NBSS_MESH_CERT="paste-pem-mesh-cert-here"
NBSS_MESH_PRIVATE_KEY="paste-pem-private-key-here"
QUEUE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;"
REPORTS_CONTAINER_NAME="notifications-reports"
RETRY_QUEUE_NAME="notifications-message-batch-retries"

APPLICATIONINSIGHTS_CONNECTION_STRING=""
APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL=True
Expand Down
2 changes: 1 addition & 1 deletion manage_breast_screening/notifications/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The application is composed of a set of Django Admin commands which are run on a
The commands currently process a feed of data from the active NBSS system.
The commands store and then send appointment notifications via NHS Notify.

Storage is handled via Azure blob containers, Azure storage queues and Postgresql.
Storage is handled via Azure blob containers and Postgresql.

Appointment notifications are sent 4 weeks prior to the appointment date.
Any appointment data processed within 4 weeks of the appointment date will also be eligible for notification on the next scheduled batch.
Expand Down

This file was deleted.

50 changes: 0 additions & 50 deletions manage_breast_screening/notifications/services/metrics.py

This file was deleted.

67 changes: 0 additions & 67 deletions manage_breast_screening/notifications/services/queue.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def setup_environment(self):
self.env.set("NBSS_MESH_PRIVATE_KEY", "mesh-private-key")
self.env.set("NBSS_MESH_CA_CERT", "mesh-ca-cert")
self.env.set("BLOB_STORAGE_CONNECTION_STRING", connection_string)
self.env.set("QUEUE_STORAGE_CONNECTION_STRING", connection_string)
self.env.set("BLOB_CONTAINER_NAME", "nbss-appoinments-data")

Helpers().add_file_to_mesh_mailbox(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
import os
import time
from contextlib import contextmanager

from mesh_client import MeshClient

from manage_breast_screening.notifications.services.queue import Queue


class Helpers:
def add_file_to_mesh_mailbox(self, filepath: str):
Expand Down Expand Up @@ -36,17 +32,3 @@ def azurite_connection_string(self):
"BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
"QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;"
)

@contextmanager
def queue_listener(self, queue: Queue, command: callable, delay=1.0):
    """Run ``command`` once the peeked size of ``queue`` changes.

    The number of items returned by ``queue.peek()`` is recorded on entry.
    After the ``with`` body finishes, the queue is polled, sleeping ``delay``
    seconds between checks, until that number differs from the recorded
    one; ``command`` is then called with no arguments.

    There is no timeout: if the peeked count never changes, this waits
    indefinitely.
    """

    def peeked_total(target):
        # Count whatever a single peek() call hands back.
        total = 0
        for _ in target.peek():
            total += 1
        return total

    baseline = peeked_total(queue)

    yield

    # Keep polling until the count moves away from the entry snapshot.
    while True:
        if peeked_total(queue) != baseline:
            break
        time.sleep(delay)

    command()
Loading