Skip to content

Commit 897b730

Browse files
authored
Merge pull request #865 from NHSDigital/dtoss-11923-remove-retry-queue
DTOSS-11923: remove retry queue
2 parents beb89fb + 0d9a65d commit 897b730

File tree

17 files changed

+50
-507
lines changed

17 files changed

+50
-507
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Metrics and logging
2+
3+
Some processes were set up for the `notifications` code to observe logs, events and metrics.
4+
5+
## Application Insights
6+
7+
The `ApplicationInsightsLogging` service class sets up logs to feed into Application Insights. It can then be called like so:
8+
9+
```python
10+
class CommandHandler:
11+
@contextmanager
12+
@staticmethod
13+
def handle(command_name):
14+
try:
15+
yield
16+
ApplicationInsightsLogging().custom_event_info(
17+
event_name=f"{command_name}Completed",
18+
message=f"{command_name} completed successfully",
19+
)
20+
except Exception as e:
21+
ApplicationInsightsLogging().exception(f"{command_name}Error: {e}")
22+
raise CommandError(e)
23+
```
24+
25+
The Application Insights resource is set up by the `app_insights_audit` module in Terraform:
26+
27+
```terraform
28+
module "app_insights_audit" {
29+
source = "../dtos-devops-templates/infrastructure/modules/app-insights"
30+
31+
name = module.shared_config.names.app-insights
32+
location = var.region
33+
resource_group_name = azurerm_resource_group.main.name
34+
appinsights_type = "web"
35+
36+
log_analytics_workspace_id = module.log_analytics_workspace_audit.id
37+
38+
# alerts
39+
action_group_id = var.action_group_id
40+
enable_alerting = var.enable_alerting
41+
}
42+
```
43+
44+
## Metrics
45+
46+
The following PR introduced a Metrics service class and command to collect information about queue sizes:
47+
<https://github.com/NHSDigital/dtos-manage-breast-screening/pull/617>
48+
49+
The metrics would feed through to the Application Insights resource and could be viewed in the Portal.

infrastructure/modules/container-apps/alerts.tf

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -36,55 +36,3 @@ resource "azurerm_monitor_scheduled_query_rules_alert_v2" "failure_event" {
3636
}
3737
}
3838
}
39-
40-
# IMPORTANT:
41-
# Enable metrics store with all dimensions: https://docs.azure.cn/en-us/azure-monitor/app/metrics-overview?tabs=standard#custom-metrics-dimensions-and-preaggregation
42-
# currently this feature is in preview.
43-
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "queue_length_high" {
44-
for_each = var.enable_alerting ? toset([
45-
"notifications-message-batch-retries"
46-
]) : []
47-
48-
name = "${var.app_short_name}-${each.key}-${var.environment}-queue-length-high-alert"
49-
location = var.region
50-
resource_group_name = azurerm_resource_group.main.name
51-
52-
auto_mitigation_enabled = true
53-
description = "Alert when queue length exceeds ${var.queue_length_alert_threshold}"
54-
display_name = "${var.app_short_name} Notifications Queue Length High Alert"
55-
enabled = true
56-
severity = 2
57-
evaluation_frequency = "PT10M"
58-
window_duration = "PT10M"
59-
scopes = [var.app_insights_id]
60-
61-
criteria {
62-
query = <<-KQL
63-
customMetrics
64-
| where name == "${each.key}"
65-
| extend environment = tostring(customDimensions.environment)
66-
| where environment == "${var.environment}"
67-
| extend value = toreal(value)
68-
| summarize avg_value = avg(value) by bin(timestamp, 5m)
69-
| where avg_value > ${var.queue_length_alert_threshold}
70-
KQL
71-
72-
metric_measure_column = "avg_value"
73-
time_aggregation_method = "Average"
74-
operator = "GreaterThan"
75-
threshold = 0
76-
77-
failing_periods {
78-
minimum_failing_periods_to_trigger_alert = 1
79-
number_of_evaluation_periods = 1
80-
}
81-
}
82-
83-
action {
84-
action_groups = [var.action_group_id]
85-
}
86-
87-
tags = {
88-
environment = var.environment
89-
}
90-
}

infrastructure/modules/container-apps/jobs.tf

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,6 @@ locals {
3333
job_short_name = "smk"
3434
job_container_args = "create_reports --smoke-test"
3535
}
36-
collect_metrics = {
37-
cron_expression = "*/5 * * * *"
38-
environment_variables = {
39-
RETRY_QUEUE_NAME = "notifications-message-batch-retries"
40-
ENVIRONMENT = var.environment
41-
}
42-
job_short_name = "clm"
43-
job_container_args = "collect_metrics"
44-
}
4536
}
4637
}
4738

infrastructure/modules/container-apps/storage.tf

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ module "storage" {
3333
private_endpoint_resource_group_name = azurerm_resource_group.main.name
3434
private_service_connection_is_manual = false
3535
}
36-
queues = local.storage_queues
3736
resource_group_name = azurerm_resource_group.main.name
3837
}
3938

infrastructure/modules/container-apps/variables.tf

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,6 @@ variable "app_insights_id" {
199199
type = string
200200
}
201201

202-
variable "queue_length_alert_threshold" {
203-
description = "If alerting is enabled, alert if storage account queues are greater than this threshold."
204-
type = number
205-
}
206202

207203
variable "enable_notifications_jobs_schedule" {
208204
description = "Whether we apply the cron schedules for the notifications container app jobs"
@@ -259,7 +255,6 @@ locals {
259255
container_access_type = "private"
260256
}
261257
}
262-
storage_queues = ["notifications-message-batch-retries"]
263258

264259
always_allowed_paths = ["/sha", "/healthcheck"]
265260
# If allowed_paths is not set, use the module default which allows any pattern

infrastructure/terraform/main.tf

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,5 @@ module "container-apps" {
7979
target_url = var.deploy_container_apps ? "${module.container-apps[0].external_url}healthcheck" : null
8080
resource_group_name_infra = local.resource_group_name
8181
enable_notifications_jobs_schedule = var.enable_notifications_jobs_schedule
82-
queue_length_alert_threshold = var.queue_length_alert_threshold
8382
min_replicas = var.min_replicas
8483
}

infrastructure/terraform/variables.tf

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,6 @@ variable "run_notifications_smoke_test" {
185185
type = bool
186186
}
187187

188-
variable "queue_length_alert_threshold" {
189-
description = "If alerting is enabled, alert if storage account queues are greater than this threshold."
190-
type = number
191-
default = 5
192-
}
193-
194188
locals {
195189
region = "uksouth"
196190

manage_breast_screening/config/.env.tpl

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ BASIC_AUTH_USERNAME=changeme
2929
BASIC_AUTH_PASSWORD=changeme
3030

3131
# Notifications specific env vars
32-
NOTIFICATIONS_BATCH_RETRY_LIMIT=5
33-
3432
API_OAUTH_API_KEY=""
3533
API_OAUTH_API_KID=""
3634
API_OAUTH_PRIVATE_KEY=""
@@ -41,9 +39,7 @@ NBSS_MESH_INBOX_NAME="paste-mesh-inbox-name-here"
4139
NBSS_MESH_PASSWORD="paste-mesh-password-here"
4240
NBSS_MESH_CERT="paste-pem-mesh-cert-here"
4341
NBSS_MESH_PRIVATE_KEY="paste-pem-private-key-here"
44-
QUEUE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;"
4542
REPORTS_CONTAINER_NAME="notifications-reports"
46-
RETRY_QUEUE_NAME="notifications-message-batch-retries"
4743

4844
APPLICATIONINSIGHTS_CONNECTION_STRING=""
4945
APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL=True

manage_breast_screening/notifications/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ The application is composed of a set of Django Admin commands which are run on a
88
The commands currently process a feed of data from the active NBSS system.
99
The commands store and then send appointment notifications via NHS Notify.
1010

11-
Storage is handled via Azure blob containers, Azure storage queues and Postgresql.
11+
Storage is handled via Azure blob containers and Postgresql.
1212

1313
Appointment notifications are sent 4 weeks prior to the appointment date.
1414
Any appointment data processed within 4 weeks of the appointment date will also be eligible for notification on the next scheduled batch.

manage_breast_screening/notifications/management/commands/collect_metrics.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

0 commit comments

Comments
 (0)