
Commit e29fec4

NRL-1188 Merge remote-tracking branch origin into feature/imaging
2 parents 96c3cc4 + d3cd4a7 commit e29fec4


29 files changed: 774 additions, 16 deletions


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -42,6 +42,9 @@ override.tf.json
 *_override.tf
 *_override.tf.json

+# Ignore output of data object
+terraform/account-wide-infrastructure/modules/glue/files/src.zip
+
 # Include override files you do wish to add to version control using negated pattern
 #
 # !example_override.tf
Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
from datetime import datetime, timedelta, timezone
from typing import Any

import boto3
import fire

from nrlf.consumer.fhir.r4.model import DocumentReference
from nrlf.core.logger import logger
from nrlf.core.validators import DocumentReferenceValidator

dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")
resource = boto3.resource("dynamodb")

logger.setLevel("ERROR")


def _validate_document(document: str):
    docref = DocumentReference.model_validate_json(document)

    validator = DocumentReferenceValidator()
    result = validator.validate(data=docref)

    if not result.is_valid:
        raise RuntimeError("Failed to validate document: " + str(result.issues))


def _find_invalid_pointers(table_name: str) -> dict[str, Any]:
    print(f"Finding invalid pointers to delete in table {table_name}....")

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
    }

    invalid_pointers = []
    total_scanned_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_id = item.get("id", {}).get("S")
            document = item.get("document", {}).get("S", "")
            try:
                _validate_document(document)
            except Exception as exc:
                invalid_pointers.append((pointer_id, exc))

        total_scanned_count += page["ScannedCount"]

        if total_scanned_count % 1000 == 0:
            print(".", end="", flush=True)

        if total_scanned_count % 100000 == 0:
            print(f"scanned={total_scanned_count} invalid={len(invalid_pointers)}")

    end_time = datetime.now(tz=timezone.utc)

    print(f" Done. Found {len(invalid_pointers)} invalid pointers")

    if len(invalid_pointers) > 0:
        print("Writing invalid pointer IDs to file ./invalid_pointers.txt ...")
        with open("invalid_pointers.txt", "w") as f:
            for _id, err in invalid_pointers:
                f.write(f"{_id}: {err}\n")

    return {
        "invalid_pointers": invalid_pointers,
        "scanned_count": total_scanned_count,
        "find-took-secs": (end_time - start_time).total_seconds(),
    }


def _delete_pointers(table_name: str, pointers_to_delete: list[str]) -> dict[str, Any]:
    """
    Delete the provided pointers from the given table.
    """
    start_time = datetime.now(tz=timezone.utc)

    print("Deleting invalid pointers...")
    pointers_deleted = 0
    failed_to_delete = 0

    for _batch_id in range(0, len(pointers_to_delete), 25):
        batch = [
            {
                "DeleteRequest": {
                    "Key": {
                        "pk": {"S": f"D#{pointer_id}"},
                        "sk": {"S": f"D#{pointer_id}"},
                    }
                }
            }
            for pointer_id in pointers_to_delete[_batch_id : _batch_id + 25]
        ]

        result = dynamodb.batch_write_item(RequestItems={table_name: batch})

        unprocessed_items = len(result.get("UnprocessedItems", []))
        pointers_deleted += len(batch) - unprocessed_items
        failed_to_delete += unprocessed_items
        if pointers_deleted % 1000 == 0:
            print(".", end="", flush=True)

    end_time = datetime.now(tz=timezone.utc)

    print(" Done")
    return {
        "pointers_to_delete": len(pointers_to_delete),
        "deleted_pointers": pointers_deleted,
        "failed_deletes": failed_to_delete,
        "deletes-took-secs": (end_time - start_time).total_seconds(),
    }


def _find_and_delete_invalid_pointers(table_name: str) -> dict[str, float | int]:
    """
    Find and delete any pointers in the given table that are invalid based on the FHIR model and NRLF validators.
    Parameters:
    - table_name: The name of the pointers table to find and delete pointers from.
    """
    find_result = _find_invalid_pointers(table_name)

    if len(find_result["invalid_pointers"]) == 0:
        return {
            "invalid_pointers": 0,
            "scanned_count": find_result["scanned_count"],
            "find-took-secs": find_result["find-took-secs"],
        }

    confirmation_input = input(
        "Would you like to delete all the invalid pointers? (yes/no): "
    )
    if confirmation_input != "yes":
        print("Invalid pointers NOT deleted.")
        find_result.pop("invalid_pointers")
        return find_result

    pointers_to_delete = [_id for _id, _ in find_result["invalid_pointers"]]

    delete_result = _delete_pointers(table_name, pointers_to_delete)

    find_result.pop("invalid_pointers")

    return {**find_result, **delete_result}


if __name__ == "__main__":
    fire.Fire(_find_and_delete_invalid_pointers)
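
Because the entry point is wrapped in fire.Fire, the table name is supplied as a positional command-line argument and the summary dict returned by _find_and_delete_invalid_pointers is printed by Fire. A minimal usage sketch follows; the module name and table name below are illustrative assumptions, not values taken from this commit.

# Hypothetical CLI invocation (script filename and table name are assumptions):
#   python cleanup_invalid_pointers.py nhsd-nrlf--dev--pointers-table
#
# The same function can also be driven directly from Python:
from cleanup_invalid_pointers import _find_and_delete_invalid_pointers  # hypothetical module name

summary = _find_and_delete_invalid_pointers("nhsd-nrlf--dev--pointers-table")  # hypothetical table name
print(summary)  # scanned_count, deleted/failed counts and the *-took-secs timings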
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
module "dev-athena" {
  source             = "../modules/athena"
  name_prefix        = "nhsd-nrlf--dev"
  target_bucket_name = module.dev-glue.target_bucket_name
}
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
module "dev-glue" {
  source         = "../modules/glue"
  name_prefix    = "nhsd-nrlf--dev"
  python_version = 3
}
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
resource "aws_athena_database" "reporting-db" {
  name = var.database

  bucket = var.target_bucket_name

  encryption_configuration {
    encryption_option = "SSE_KMS"
    kms_key           = aws_kms_key.athena.arn
  }

  force_destroy = true
}

resource "aws_athena_workgroup" "athena" {
  name = "${var.name_prefix}-athena-wg"

  configuration {
    enforce_workgroup_configuration    = true
    publish_cloudwatch_metrics_enabled = true

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena.bucket}/output/"

      encryption_configuration {
        encryption_option = "SSE_KMS"
        kms_key_arn       = aws_kms_key.athena.arn
      }
    }
  }

}
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
resource "aws_kms_key" "athena" {
}

resource "aws_kms_alias" "athena" {
  name          = "alias/${var.name_prefix}-athena"
  target_key_id = aws_kms_key.athena.key_id
}
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
output "workgroup" {
  value = aws_athena_workgroup.athena
}

output "bucket" {
  value = aws_s3_bucket.athena
}

output "database" {
  value = aws_athena_database.reporting-db
}
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
resource "aws_s3_bucket" "athena" {
  bucket = "${var.name_prefix}-athena"
}

resource "aws_s3_bucket_policy" "athena" {
  bucket = "${var.name_prefix}-athena"

  policy = jsonencode({
    Version = "2012-10-17"
    Id      = "athena-policy"
    Statement = [
      {
        Sid    = "HTTPSOnly"
        Effect = "Deny"
        Principal = {
          "AWS" : "*"
        }
        Action = "s3:*"
        Resource = [
          aws_s3_bucket.athena.arn,
          "${aws_s3_bucket.athena.arn}/*",
        ]
        Condition = {
          Bool = {
            "aws:SecureTransport" = "false"
          }
        }
      },
    ]
  })
}

resource "aws_s3_bucket_public_access_block" "athena-public-access-block" {
  bucket = aws_s3_bucket.athena.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_server_side_encryption_configuration" "athena" {
  bucket = aws_s3_bucket.athena.bucket
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm     = "aws:kms"
      kms_master_key_id = aws_kms_key.athena.arn
    }
  }

}
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
variable "database" {
  description = "Name of the Athena reporting database"
  default     = "nrl_reporting"
}

variable "name_prefix" {
  type        = string
  description = "The prefix to apply to all resources in the module."
}

variable "target_bucket_name" {
  type = string
}
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
# Create Glue Data Catalog Database
resource "aws_glue_catalog_database" "raw_log_database" {
  name         = "${var.name_prefix}-raw_log"
  location_uri = "s3://${aws_s3_bucket.source-data-bucket.id}/"
}

# Create Glue Crawler
resource "aws_glue_crawler" "raw_log_crawler" {
  name          = "${var.name_prefix}-raw-log-crawler"
  database_name = aws_glue_catalog_database.raw_log_database.name
  role          = aws_iam_role.glue_service_role.name
  s3_target {
    path = "s3://${aws_s3_bucket.source-data-bucket.id}/"
  }
  schema_change_policy {
    delete_behavior = "LOG"
  }
  configuration = jsonencode({
    "Version" : 1.0,
    "Grouping" : {
      "TableGroupingPolicy" : "CombineCompatibleSchemas"
    }
  })
}
resource "aws_glue_trigger" "raw_log_trigger" {
  name = "${var.name_prefix}-org-report-trigger"
  type = "ON_DEMAND"
  actions {
    crawler_name = aws_glue_crawler.raw_log_crawler.name
  }
}

resource "aws_glue_job" "glue_job" {
  name              = "${var.name_prefix}-glue-job"
  role_arn          = aws_iam_role.glue_service_role.arn
  description       = "Transfer logs from source to bucket"
  glue_version      = "4.0"
  worker_type       = "G.1X"
  timeout           = 2880
  max_retries       = 1
  number_of_workers = 2
  command {
    name            = "glueetl"
    python_version  = var.python_version
    script_location = "s3://${aws_s3_bucket.code-bucket.id}/main.py"
  }

  default_arguments = {
    "--enable-auto-scaling"              = "true"
    "--enable-continuous-cloudwatch-log" = "true"
    "--datalake-formats"                 = "delta"
    "--source-path"                      = "s3://${aws_s3_bucket.source-data-bucket.id}/" # Specify the source S3 path
    "--destination-path"                 = "s3://${aws_s3_bucket.target-data-bucket.id}/" # Specify the destination S3 path
    "--job-name"                         = "poc-glue-job"
    "--enable-continuous-log-filter"     = "true"
    "--enable-metrics"                   = "true"
    "--extra-py-files"                   = "s3://${aws_s3_bucket.code-bucket.id}/src.zip"
  }
}
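
The Glue job above points script_location at a main.py in the code bucket and passes --source-path and --destination-path through default_arguments; that script is not part of this excerpt. As a rough illustration only, a minimal PySpark sketch of how a Glue 4.0 glueetl script might consume those arguments is shown below. The read/write formats and the hyphen-to-underscore normalisation of argument names by getResolvedOptions are assumptions, not details taken from this commit.

import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

# Resolve the arguments supplied via default_arguments in the Terraform above.
# Assumption: "--source-path" is exposed as args["source_path"] (argparse-style
# hyphen-to-underscore normalisation).
args = getResolvedOptions(sys.argv, ["JOB_NAME", "source-path", "destination-path"])

glue_context = GlueContext(SparkContext.getOrCreate())
spark = glue_context.spark_session

job = Job(glue_context)
job.init(args["JOB_NAME"], args)

# Illustrative transfer only: read raw logs from the source bucket and write them
# to the target bucket. The real transformations live in the repo's main.py.
df = spark.read.json(args["source_path"])
df.write.mode("overwrite").parquet(args["destination_path"])

job.commit()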
