Skip to content

Commit f3c669a

Browse files
authored
[PRMP-1084] Add document review setup and processing script (#987)
1 parent 31a3c8b commit f3c669a

File tree

2 files changed

+359
-0
lines changed

2 files changed

+359
-0
lines changed

.github/workflows/base-run-bulk-upload.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ jobs:
7979
--empty-lloydgeorge-store
8080
working-directory: ./tests/bulk-upload/scripts
8181

82+
- name: Setup Document Review
83+
run: |
84+
python setup_document_review.py
85+
working-directory: ./tests/bulk-upload/scripts
86+
env:
87+
ENVIRONMENT: "${{ inputs.sandbox }}"
88+
8289
- name: Run Bulk Upload
8390
run: |
8491
python run_bulk_upload.py \
Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
import csv
2+
import os
3+
import uuid
4+
from datetime import datetime, timedelta
5+
from typing import Any, Dict, List, NamedTuple
6+
7+
import boto3
8+
from botocore.exceptions import ClientError
9+
10+
# Local file whose bytes are uploaded to S3 under every generated test file name
# (main() passes it to upload_files_to_s3). The path is relative to the working
# directory the script is started from.
SOURCE_PDF_FILE = "../source_to_copy_from.pdf"
11+
12+
13+
class Patient(NamedTuple):
    """Immutable patient record built from one row of the test-patient CSV."""

    # Given, other-given and family names joined with single spaces.
    full_name: str
    # DOB column text with "/" separators replaced by "-".
    date_of_birth: str
    # Value of the NHS_NO column.
    nhs_number: str
    # Value of the GPP column; used as Author and Custodian of review records.
    ods_code: str
18+
19+
20+
def get_timestamp(days_ago: int = 0) -> int:
    """Return the Unix timestamp, in whole seconds, of `days_ago` days before now.

    Args:
        days_ago: Number of days to step back from the current local time.
            Zero (the default) yields the current moment.

    Returns:
        Seconds since the epoch, truncated to an integer.
    """
    offset = timedelta(days=days_ago)
    moment = datetime.now() - offset
    epoch_seconds = moment.timestamp()
    return int(epoch_seconds)
23+
24+
25+
def get_patients(filename: str) -> List[Dict]:
26+
patients = []
27+
csv_path = f"../test_patients_data/{filename}"
28+
29+
if not os.path.exists(csv_path):
30+
print(f"Warning: {csv_path} not found.")
31+
32+
else:
33+
with open(csv_path, mode="r", newline="") as file:
34+
reader = csv.DictReader(file)
35+
for idx, row in enumerate(reader):
36+
if idx >= 300:
37+
break
38+
patients.append(row)
39+
return patients
40+
41+
42+
def parse_patient_record(raw_record: dict) -> Patient:
    """Convert one raw CSV row into a `Patient`.

    Args:
        raw_record: Mapping with the keys NHS_NO, GIVEN_NAME, FAMILY_NAME,
            DOB and GPP, and optionally OTHER_GIVEN_NAME.

    Returns:
        A `Patient` whose full name joins the non-empty name parts with
        spaces and whose date of birth has "/" replaced by "-".

    Raises:
        KeyError: If a required key is absent from `raw_record`.
    """
    nhs_number = raw_record["NHS_NO"]

    # Empty or missing name parts are skipped so no double spaces appear.
    present_names = filter(
        None,
        (
            raw_record["GIVEN_NAME"],
            raw_record.get("OTHER_GIVEN_NAME", ""),
            raw_record["FAMILY_NAME"],
        ),
    )
    full_name = " ".join(present_names)

    dashed_dob = raw_record["DOB"].replace("/", "-")
    practice_code = raw_record["GPP"]

    return Patient(
        full_name=full_name,
        date_of_birth=dashed_dob,
        nhs_number=nhs_number,
        ods_code=practice_code,
    )
53+
54+
55+
def build_document_review_object(
    patient: Patient,
    review_id: str,
    files: List[Dict[str, str]],
    review_status: str = "PENDING_REVIEW",
    review_reason: str = "General error",
    days_ago_uploaded: int = 1,
    reviewer: str | None = None,
    review_date: int | None = None,
    document_reference_id: str | None = None,
) -> Dict[str, Any]:
    """Assemble one document-review record as a plain dict.

    Args:
        patient: Patient supplying the NHS number and the ODS code, which is
            used for both Author and Custodian.
        review_id: Value stored under "ID".
        files: File references stored under "Files".
        review_status: Value stored under "ReviewStatus".
        review_reason: Value stored under "ReviewReason".
        days_ago_uploaded: Age of the upload in days; converted to the
            "UploadDate" timestamp.
        reviewer: Stored under "Reviewer" when truthy.
        review_date: Stored under "ReviewDate" when truthy.
        document_reference_id: Stored under "DocumentReferenceId" when truthy.

    Returns:
        The record, always at Version 1 and with the Lloyd George SNOMED code.
    """
    uploaded_at = int(get_timestamp(days_ago=days_ago_uploaded))

    record: Dict[str, Any] = {
        "ID": review_id,
        "Version": 1,
        "Author": f"{patient.ods_code}",
        "Custodian": patient.ods_code,
        "ReviewStatus": review_status,
        "ReviewReason": review_reason,
        "UploadDate": uploaded_at,
        "Files": files,
        "NhsNumber": patient.nhs_number,
        "DocumentSnomedCodeType": "16521000000101",  # Lloyd George code
    }

    # Optional attributes are only written when the caller supplied a value.
    optional_fields = (
        ("Reviewer", reviewer),
        ("ReviewDate", review_date),
        ("DocumentReferenceId", document_reference_id),
    )
    record.update({key: value for key, value in optional_fields if value})

    return record
91+
92+
93+
def build_file_reference(
    upload_id, file_name: str, bucket_name: str
) -> Dict[str, str]:
    """Describe one file of a review as a name plus its S3 URI.

    Args:
        upload_id: Identifier used as the key prefix (folder) in the bucket.
        file_name: Name of the file; also the last segment of the key.
        bucket_name: Bucket the file lives in.

    Returns:
        A dict with "FileName" and "FileLocation", the latter in the form
        ``s3://<bucket_name>/<upload_id>/<file_name>``.
    """
    return dict(
        FileName=file_name,
        FileLocation=f"s3://{bucket_name}/{upload_id}/{file_name}",
    )
103+
104+
105+
def create_test_scenarios(patients: List[Patient], bucket_name: str):
    """Build document-review records and the list of files that back them.

    Patients are assigned to seven scenarios in round-robin order
    (patient 0 -> scenario 1, patient 7 -> scenario 1 again, ...).

    Args:
        patients: Patients to generate review data for.
        bucket_name: Bucket name embedded in every file's S3 location.

    Returns:
        A ``(review_objects, files_to_upload)`` tuple. ``review_objects`` is a
        list of review dicts (scenario 7 contributes two per patient).
        ``files_to_upload`` is a list of ``(nhs_number, file_name,
        file_location)`` tuples, one per referenced file.
    """

    def make_files(patient, review_id, file_names):
        """Return (file references, upload tuples) for the given file names."""
        files = [
            build_file_reference(review_id, file_name, bucket_name)
            for file_name in file_names
        ]
        uploads = [
            (patient.nhs_number, ref["FileName"], ref["FileLocation"])
            for ref in files
        ]
        return files, uploads

    def review_pdf_names(patient, count):
        """Return the names upload_review_<nhs>_doc1.pdf .. doc<count>.pdf."""
        return [
            f"upload_review_{patient.nhs_number}_doc{number}.pdf"
            for number in range(1, count + 1)
        ]

    def scenario_1(patient):
        """Pending Review with single file"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(patient, review_id, review_pdf_names(patient, 1))
        review_obj = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="PENDING_REVIEW",
            review_reason="General error",
            days_ago_uploaded=1,
        )
        return [review_obj], uploads

    def scenario_2(patient):
        """Pending Review with multiple files"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(patient, review_id, review_pdf_names(patient, 3))
        review_obj = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="PENDING_REVIEW",
            review_reason="More or less files than we expected",
            days_ago_uploaded=2,
        )
        return [review_obj], uploads

    def scenario_3(patient):
        """Approved review"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(patient, review_id, review_pdf_names(patient, 1))
        review_obj = build_document_review_object(
            patient=patient,
            review_id=review_id,
            files=files,
            review_status="APPROVED",
            review_reason="Demographic mismatches",
            days_ago_uploaded=5,
            reviewer="H81109",
            review_date=get_timestamp(days_ago=2),
            document_reference_id=str(uuid.uuid4()),
        )
        return [review_obj], uploads

    def scenario_4(patient):
        """Rejected review"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(patient, review_id, review_pdf_names(patient, 1))
        review_obj = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="REJECTED",
            review_reason="Filename Naming convention error",
            days_ago_uploaded=7,
            reviewer="H81109",
            review_date=get_timestamp(days_ago=3),
        )
        return [review_obj], uploads

    def scenario_5(patient):
        """Approved with multiple files and document reference"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(patient, review_id, review_pdf_names(patient, 2))
        review_obj = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="APPROVED",
            review_reason="Duplicate records error",
            days_ago_uploaded=10,
            reviewer="H81109",
            review_date=get_timestamp(days_ago=5),
            document_reference_id=str(uuid.uuid4()),
        )
        return [review_obj], uploads

    def scenario_6(patient):
        """random document type review"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(
            patient, review_id, [f"random_upload_{patient.nhs_number}.txt"]
        )
        review_obj = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="PENDING_REVIEW",
            review_reason="Unknown NHS number",
            days_ago_uploaded=3,
        )
        # Replace the default Lloyd George code with a different document type.
        review_obj["DocumentSnomedCodeType"] = "734163000"
        return [review_obj], uploads

    def scenario_7(patient):
        """Multiple versions: NEVER_REVIEWED (v1) and PENDING_REVIEW (v2) with different custodians"""
        review_id = str(uuid.uuid4())
        files, uploads = make_files(patient, review_id, review_pdf_names(patient, 1))

        review_obj_v1 = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="NEVER_REVIEWED",
            review_reason="General error",
            review_date=get_timestamp(days_ago=1),
            days_ago_uploaded=15,
        )
        review_obj_v1["Version"] = 1
        review_obj_v1["Author"] = "A12345"
        review_obj_v1["Reviewer"] = "A12345"

        review_obj_v2 = build_document_review_object(
            review_id=review_id,
            patient=patient,
            files=files,
            review_status="PENDING_REVIEW",
            review_reason="General error",
            days_ago_uploaded=15,
        )
        review_obj_v2["Version"] = 2
        # Both versions share the original author; only the custodian moves.
        # (Previously this line re-assigned v1's Author, leaving v2 untouched.)
        review_obj_v2["Author"] = "A12345"
        review_obj_v2["Custodian"] = "H81109"

        return [review_obj_v1, review_obj_v2], uploads

    scenarios = [
        scenario_1,
        scenario_2,
        scenario_3,
        scenario_4,
        scenario_5,
        scenario_6,
        scenario_7,
    ]

    review_objects = []
    files_to_upload = []

    for idx, patient in enumerate(patients):
        # Cycle through the scenarios so every one is represented.
        scenario_func = scenarios[idx % len(scenarios)]
        scenario_reviews, patient_files = scenario_func(patient)
        review_objects.extend(scenario_reviews)
        files_to_upload.extend(patient_files)

    return review_objects, files_to_upload
288+
289+
290+
def upload_files_to_s3(files_to_upload: List[tuple], source_pdf: str):
    """Upload a copy of `source_pdf` to every S3 location in `files_to_upload`.

    Uploading is best-effort: a failure for one target is printed and the
    remaining targets are still attempted.

    Args:
        files_to_upload: Tuples of ``(nhs_number, file_name, file_location)``
            where ``file_location`` has the form ``s3://<bucket>/<key>``.
        source_pdf: Path of the local file whose contents are uploaded under
            every key.
    """
    # The same source is used for every target, so check it once instead of
    # reporting the same missing file for each upload.
    if not os.path.isfile(source_pdf):
        print(f"Source file not found: {source_pdf}")
        return

    s3_client = boto3.client("s3")

    for _nhs_number, file_name, file_location in files_to_upload:
        # Strip only the leading scheme, then split bucket from key.
        bucket_name, s3_key = file_location.removeprefix("s3://").split("/", 1)

        try:
            s3_client.upload_file(
                Filename=source_pdf,
                Bucket=bucket_name,
                Key=s3_key,
                ExtraArgs={"ContentType": "application/pdf"},
            )
        except FileNotFoundError:
            # The source disappeared after the initial check.
            print(f"Source file not found: {source_pdf}")
        except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
            # upload_file re-raises service errors as S3UploadFailedError, so
            # catching ClientError alone would let upload failures escape.
            print(f"Error uploading {file_name}: {e}")
308+
309+
310+
def write_to_dynamodb(review_objects: List[Dict[str, Any]], table_name: str):
    """Write every review object to the named DynamoDB table in batches.

    Args:
        review_objects: Items to put into the table.
        table_name: Name of the target DynamoDB table.

    Raises:
        ClientError: Re-raised after its message has been printed.
    """
    review_table = boto3.resource("dynamodb").Table(table_name)

    try:
        with review_table.batch_writer() as writer:
            for item in review_objects:
                writer.put_item(Item=item)
        print(f"\nSuccessfully wrote {len(review_objects)} review objects to DynamoDB")
    except ClientError as err:
        print(f"Error writing to DynamoDB: {err.response['Error']['Message']}")
        raise
322+
323+
324+
def main():
    """Create document-review test data: load patients, upload files, write records.

    Configuration comes from environment variables:

    * ``ENVIRONMENT`` - prefix of the bucket and table names (default "ndr-dev").
    * ``PATIENT_DATA_FILE`` - CSV file name to load (default "ODS_Code_H81109.csv").

    A variable that is set but empty is treated as unset, so a blank value
    cannot produce resource names such as "-document-pending-review-store".
    """
    # `or` (rather than a .get default) also covers set-but-empty variables.
    environment = os.environ.get("ENVIRONMENT") or "ndr-dev"
    patient_file = os.environ.get("PATIENT_DATA_FILE") or "ODS_Code_H81109.csv"

    bucket_name = f"{environment}-document-pending-review-store"
    table_name = f"{environment}_DocumentUploadReview"

    # Any failure while reading or parsing patients aborts the run before
    # anything is written to AWS.
    try:
        patients_data = get_patients(patient_file)
        patients = [parse_patient_record(record) for record in patients_data]
        print(f"Loaded {len(patients)} patients")
    except Exception as e:
        print(f"Error loading patients: {e}")
        return

    review_objects, files_to_upload = create_test_scenarios(patients, bucket_name)
    print(f"Created {len(review_objects)} review objects with {len(files_to_upload)} files")

    print("\nUploading files to S3...")
    upload_files_to_s3(files_to_upload, SOURCE_PDF_FILE)

    print("\nWriting to DynamoDB...")
    write_to_dynamodb(review_objects, table_name)

    print("SETUP COMPLETE")
348+
349+
350+
# Run the setup only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
352+

0 commit comments

Comments
 (0)