Skip to content

Commit f3706f6

Browse files
Merge pull request #1111 from NHSDigital/feature/made14-NRL-1860-nft-run-prep
NRL-1860 Tweak prep scripts to support NFT runs
2 parents 9b24990 + fc5c8fb commit f3706f6

File tree

15 files changed

+185
-54
lines changed

15 files changed

+185
-54
lines changed

scripts/seed_nft_tables.py

Lines changed: 62 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1+
import csv
12
from datetime import datetime, timedelta, timezone
23
from itertools import cycle
34
from math import gcd
45
from random import shuffle
5-
from typing import Any
6+
from typing import Any, Iterator
67

78
import boto3
89
import fire
910

11+
# import json
12+
import numpy as np
13+
1014
from nrlf.consumer.fhir.r4.model import DocumentReference
1115
from nrlf.core.constants import (
1216
CATEGORY_ATTRIBUTES,
@@ -145,7 +149,7 @@ def _populate_seed_table(
145149
px_with_pointers: int,
146150
pointers_per_px: float = 1.0,
147151
type_dists: dict[str, int] = DEFAULT_TYPE_DISTRIBUTIONS,
148-
custodian_dists: dict[str, int] = DEFAULT_CUSTODIAN_DISTRIBUTIONS,
152+
custodian_dists: dict[str, dict[str, int]] = DEFAULT_CUSTODIAN_DISTRIBUTIONS,
149153
):
150154
"""
151155
Seeds a table with example data for non-functional testing.
@@ -155,25 +159,41 @@ def _populate_seed_table(
155159
# set up iterations
156160
type_iter = _set_up_cyclical_iterator(type_dists)
157161
custodian_iters = _set_up_custodian_iterators(custodian_dists)
158-
count_iter = _set_up_cyclical_iterator(DEFAULT_COUNT_DISTRIBUTIONS)
162+
# count_iter = _set_up_cyclical_iterator(DEFAULT_COUNT_DISTRIBUTIONS)
163+
count_iter = _get_pointer_count_poisson_distributions(
164+
px_with_pointers, pointers_per_px
165+
)
166+
# count_iter = _get_pointer_count_negbinom_distributions(px_with_pointers, pointers_per_px)
159167
testnum_cls = TestNhsNumbersIterator()
160168
testnum_iter = iter(testnum_cls)
161169

162170
px_counter = 0
163171
doc_ref_target = int(pointers_per_px * px_with_pointers)
164172
print(
165-
f"Will upsert {doc_ref_target} test pointers for {px_with_pointers} patients."
173+
f"Will upsert ~{doc_ref_target} test pointers for {px_with_pointers} patients."
166174
)
167175
doc_ref_counter = 0
168176
batch_counter = 0
177+
unprocessed_count = 0
178+
179+
pointer_data: list[list[str]] = []
169180

170181
start_time = datetime.now(tz=timezone.utc)
171182

172-
batch_upsert_items = []
173-
while px_counter <= px_with_pointers:
183+
batch_upsert_items: list[dict[str, Any]] = []
184+
while px_counter < px_with_pointers:
174185
pointers_for_px = int(next(count_iter))
186+
175187
if batch_counter + pointers_for_px > 25 or px_counter == px_with_pointers:
176-
resource.batch_write_item(RequestItems={table_name: batch_upsert_items})
188+
response = resource.batch_write_item(
189+
RequestItems={table_name: batch_upsert_items}
190+
)
191+
192+
if response.get("UnprocessedItems"):
193+
unprocessed_count += len(
194+
response.get("UnprocessedItems").get(table_name, [])
195+
)
196+
177197
batch_upsert_items = []
178198
batch_counter = 0
179199

@@ -189,55 +209,68 @@ def _populate_seed_table(
189209
)
190210
put_req = {"PutRequest": {"Item": pointer.model_dump()}}
191211
batch_upsert_items.append(put_req)
212+
pointer_data.append(
213+
[
214+
pointer.id,
215+
pointer.type,
216+
pointer.custodian,
217+
pointer.nhs_number,
218+
]
219+
)
192220
px_counter += 1
193221

222+
if px_counter % 1000 == 0:
223+
print(".", end="", flush=True)
224+
if px_counter % 100000 == 0:
225+
print(f" {px_counter} patients processed ({doc_ref_counter} pointers).")
226+
227+
print(" Done.")
228+
194229
end_time = datetime.now(tz=timezone.utc)
195230
print(
196-
f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds."
231+
f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds (unprocessed: {unprocessed_count})."
197232
)
198233

234+
with open("./dist/seed-nft-pointers.csv", "w") as f:
235+
writer = csv.writer(f)
236+
writer.writerow(["pointer_id", "pointer_type", "custodian", "nhs_number"])
237+
writer.writerows(pointer_data)
238+
print(f"Pointer data saved to ./dist/seed-nft-pointers.csv") # noqa
239+
199240

200-
def _set_up_cyclical_iterator(dists: dict[str, int]) -> iter:
241+
def _set_up_cyclical_iterator(dists: dict[str, int]) -> Iterator[str]:
    """
    Build an endless iterator over the keys of ``dists``, weighted by frequency.

    The relative frequencies are first reduced by their greatest common
    divisor, each key is then repeated that reduced number of times, and the
    resulting pool is shuffled once before being cycled through forever.

    Interleaving the values like this should give more live-like data than
    e.g. creating a bulk amount of each pointer type/custodian in series, and
    it means each batch holds a representative sample of the distribution.
    """
    divisor = gcd(*dists.values())
    pool: list[str] = []
    for name, weight in dists.items():
        # Repeat each key according to its reduced relative frequency.
        pool += [name] * (weight // divisor)
    shuffle(pool)
    return cycle(pool)
213254

214255

256+
def _get_pointer_count_poisson_distributions(
    num_of_patients: int, pointers_per_px: float, max_pointers: int = 4
) -> Iterator[int]:
    """
    Return an endless iterator of per-patient pointer counts.

    Counts are drawn as ``1 + Poisson(pointers_per_px - 1)`` so that no
    patient gets zero pointers, then capped at ``max_pointers``. One count is
    sampled per patient and the sample is cycled through indefinitely.

    Because the upper tail is clipped, the realised mean can fall slightly
    below ``pointers_per_px``.

    Args:
        num_of_patients: Number of counts to sample before the cycle repeats.
        pointers_per_px: Target mean pointers per patient; must be >= 1.
        max_pointers: Largest count any single patient may receive.

    Raises:
        ValueError: If ``pointers_per_px`` is below 1 or ``max_pointers`` is
            below 1.
    """
    if pointers_per_px < 1:
        # Fail with a clear message rather than numpy's opaque "lam < 0".
        raise ValueError(
            f"pointers_per_px must be >= 1 (every patient has at least one "
            f"pointer), got {pointers_per_px}"
        )
    if max_pointers < 1:
        raise ValueError(f"max_pointers must be >= 1, got {max_pointers}")

    p_count_distr = np.random.poisson(lam=pointers_per_px - 1, size=num_of_patients) + 1
    p_count_distr = np.clip(p_count_distr, a_min=1, a_max=max_pointers)
    # tolist() converts numpy integers to plain Python ints, matching the
    # declared Iterator[int] return type.
    return cycle(p_count_distr.tolist())
262+
263+
215264
def _set_up_custodian_iterators(
    custodian_dists: dict[str, dict[str, int]]
) -> dict[str, Iterator[str]]:
    """
    Build one cyclical iterator per pointer type.

    ``custodian_dists`` maps each pointer type to a dict of values and their
    relative frequencies. The result maps the same pointer types to the
    iterator that ``_set_up_cyclical_iterator`` produces for that inner dict.
    """
    return {
        pointer_type: _set_up_cyclical_iterator(dists)
        for pointer_type, dists in custodian_dists.items()
    }
224273

225274

226-
def _set_up_count_iterator(pointers_per_px: float) -> iter:
227-
"""
228-
Given a target average number of pointers per patient,
229-
generates a distribution of counts per individual patient.
230-
"""
231-
232-
extra_per_hundred = int(
233-
(pointers_per_px - 1.0) * 100
234-
) # no patients can have zero pointers
235-
counts = {}
236-
counts["3"] = extra_per_hundred // 10
237-
counts["2"] = extra_per_hundred - 2 * counts["3"]
238-
counts["1"] = 100 - counts[2] - counts[3]
239-
return _set_up_cyclical_iterator(counts)
240-
241-
242275
if __name__ == "__main__":
243276
fire.Fire(_populate_seed_table)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
output "bucket_name" {
2+
description = "Name of the metadata S3 bucket"
3+
value = aws_s3_bucket.metadata_bucket.bucket
4+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
resource "aws_s3_bucket" "metadata_bucket" {
2+
bucket = "${var.name_prefix}-metadata"
3+
force_destroy = false
4+
}
5+
6+
resource "aws_s3_bucket_policy" "metadata_bucket_policy" {
7+
bucket = aws_s3_bucket.metadata_bucket.id
8+
9+
policy = jsonencode({
10+
Version = "2012-10-17"
11+
Id = "metadata_bucket_policy"
12+
Statement = [
13+
{
14+
Sid = "HTTPSOnly"
15+
Effect = "Deny"
16+
Principal = "*"
17+
Action = "s3:*"
18+
Resource = [
19+
aws_s3_bucket.metadata_bucket.arn,
20+
"${aws_s3_bucket.metadata_bucket.arn}/*",
21+
]
22+
Condition = {
23+
Bool = {
24+
"aws:SecureTransport" = "false"
25+
}
26+
}
27+
},
28+
]
29+
})
30+
}
31+
32+
resource "aws_s3_bucket_public_access_block" "metadata_bucket_public_access_block" {
33+
bucket = aws_s3_bucket.metadata_bucket.id
34+
35+
block_public_acls = true
36+
block_public_policy = true
37+
ignore_public_acls = true
38+
restrict_public_buckets = true
39+
}
40+
41+
resource "aws_s3_bucket_server_side_encryption_configuration" "metadata_bucket" {
42+
bucket = aws_s3_bucket.metadata_bucket.bucket
43+
44+
rule {
45+
apply_server_side_encryption_by_default {
46+
sse_algorithm = "AES256"
47+
}
48+
}
49+
}
50+
51+
resource "aws_s3_bucket_versioning" "metadata_bucket" {
52+
bucket = aws_s3_bucket.metadata_bucket.id
53+
versioning_configuration {
54+
status = "Enabled"
55+
}
56+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
variable "name_prefix" {
2+
type = string
3+
description = "The prefix to apply to all resources in the module."
4+
}

terraform/account-wide-infrastructure/test/dynamodb__pointers-table.tf

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@ module "ref-pointers-table" {
3232
}
3333

3434
module "perftest-pointers-table" {
35-
source = "../modules/pointers-table"
36-
name_prefix = "nhsd-nrlf--perftest"
37-
enable_deletion_protection = true
38-
enable_pitr = true
39-
kms_deletion_window_in_days = 30
35+
source = "../modules/pointers-table"
36+
name_prefix = "nhsd-nrlf--perftest"
4037
}

terraform/account-wide-infrastructure/test/s3.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,8 @@ module "perftest-truststore-bucket" {
6464
name_prefix = "nhsd-nrlf--perftest"
6565
server_certificate_file = "../../../truststore/server/perftest.pem"
6666
}
67+
68+
module "perftest-metadata-bucket" {
69+
source = "../modules/metadata-bucket"
70+
name_prefix = "nhsd-nrlf--perftest"
71+
}

terraform/infrastructure/data.tf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,22 @@ data "aws_iam_policy" "auth-store-read-policy" {
1717

1818
data "aws_dynamodb_table" "pointers-table" {
1919
count = var.use_shared_resources ? 1 : 0
20-
name = "${local.shared_prefix}-pointers-table"
20+
name = "${local.pointers_table_prefix}-pointers-table"
2121
}
2222

2323
data "aws_iam_policy" "pointers-table-read" {
2424
count = var.use_shared_resources ? 1 : 0
25-
name = "${local.shared_prefix}-pointers-table-read"
25+
name = "${local.pointers_table_prefix}-pointers-table-read"
2626
}
2727

2828
data "aws_iam_policy" "pointers-table-write" {
2929
count = var.use_shared_resources ? 1 : 0
30-
name = "${local.shared_prefix}-pointers-table-write"
30+
name = "${local.pointers_table_prefix}-pointers-table-write"
3131
}
3232

3333
data "aws_iam_policy" "pointers-kms-read-write" {
3434
count = var.use_shared_resources ? 1 : 0
35-
name = "${local.shared_prefix}-pointers-kms-read-write"
35+
name = "${local.pointers_table_prefix}-pointers-kms-read-write"
3636
}
3737

3838
data "external" "current-info" {
Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
account_name = "dev"
22
aws_account_name = "dev"
33

4+
dynamodb_pointers_table_prefix = "nhsd-nrlf--dev"
5+
dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--dev-sandbox"
6+
47
domain = "api.record-locator.dev.national.nhs.uk"
58
public_domain = "internal-dev.api.service.nhs.uk"
69
public_sandbox_domain = "internal-dev-sandbox.api.service.nhs.uk"
7-
log_retention_period = 90
8-
enable_reporting = false
10+
11+
log_retention_period = 90
12+
enable_reporting = false
Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
account_name = "int"
22
aws_account_name = "test"
33

4-
domain = "api.record-locator.int.national.nhs.uk"
5-
deletion_protection = true
4+
dynamodb_pointers_table_prefix = "nhsd-nrlf--int"
5+
dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--int-sandbox"
6+
deletion_protection = true
67

8+
domain = "api.record-locator.int.national.nhs.uk"
79
public_domain = "int.api.service.nhs.uk"
810
public_sandbox_domain = "sandbox.api.service.nhs.uk"
9-
log_retention_period = 90
10-
enable_reporting = true
11+
12+
log_retention_period = 90
13+
enable_reporting = true
Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
account_name = "perftest"
22
aws_account_name = "test"
33

4-
domain = "perftest.record-locator.national.nhs.uk"
5-
public_domain = "perftest.api.service.nhs.uk"
6-
deletion_protection = true
4+
dynamodb_pointers_table_prefix = "nhsd-nrlf--perftest"
5+
6+
domain = "perftest.record-locator.national.nhs.uk"
7+
public_domain = "perftest.api.service.nhs.uk"
8+
79
log_retention_period = 30
810
enable_reporting = false
911
disable_firehose_lambda_subscriptions = true

0 commit comments

Comments
 (0)