Skip to content

Commit 1d0ad5c

Browse files
authored
[PRMT-601] Allow filepath to be used for demographics and without numbering logic (#771)
1 parent 81d01b1 commit 1d0ad5c

16 files changed

+1703
-696
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from enum import StrEnum
2+
3+
4+
class LloydGeorgePreProcessFormat(StrEnum):
5+
GENERAL = "general"
6+
USB = "usb"

lambdas/handlers/bulk_upload_metadata_preprocessor_handler.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1-
from services.bulk_upload_metadata_preprocessor_service import (
2-
MetadataPreprocessorService,
1+
from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat
2+
from services.bulk_upload.metadata_general_preprocessor import (
3+
MetadataGeneralPreprocessor,
4+
)
5+
from services.bulk_upload.metadata_usb_preprocessor import (
6+
MetadataUsbPreprocessorService,
37
)
48
from utils.audit_logging_setup import LoggingService
59
from utils.decorators.ensure_env_var import ensure_environment_variables
@@ -16,7 +20,11 @@
1620
@handle_lambda_exceptions
1721
def lambda_handler(event, _context):
1822
practice_directory = event.get("practiceDirectory")
23+
raw_pre_format_type = event.get(
24+
"preFormatType", LloydGeorgePreProcessFormat.GENERAL
25+
)
1926

27+
pre_processor_service = get_pre_process_service(raw_pre_format_type)
2028
if not practice_directory:
2129
logger.info(
2230
"Failed to start metadata pre-processor due to missing practice directory"
@@ -26,5 +34,20 @@ def lambda_handler(event, _context):
2634
logger.info(
2735
f"Starting metadata pre-processor for practice directory: {practice_directory}"
2836
)
29-
metadata_service = MetadataPreprocessorService(practice_directory)
37+
38+
metadata_service = pre_processor_service(practice_directory)
3039
metadata_service.process_metadata()
40+
41+
42+
def get_pre_process_service(raw_pre_format_type):
    """Return the metadata pre-processor class for the requested format.

    Args:
        raw_pre_format_type: The requested format, either a
            ``LloydGeorgePreProcessFormat`` member or its string value.

    Returns:
        ``MetadataUsbPreprocessorService`` for the USB format, otherwise
        ``MetadataGeneralPreprocessor``. The class itself is returned, not an
        instance; the caller is expected to instantiate it.
    """
    try:
        # Only the enum conversion can raise ValueError, so the try body is
        # limited to it.
        pre_format_type = LloydGeorgePreProcessFormat(raw_pre_format_type)
    except ValueError:
        logger.warning(
            f"Invalid preFormatType: '{raw_pre_format_type}', defaulting to {LloydGeorgePreProcessFormat.GENERAL}."
        )
        return MetadataGeneralPreprocessor

    services = {
        LloydGeorgePreProcessFormat.GENERAL: MetadataGeneralPreprocessor,
        LloydGeorgePreProcessFormat.USB: MetadataUsbPreprocessorService,
    }
    # Fall back to the general pre-processor so that a format without a
    # mapping never yields None (which the caller would then try to call).
    return services.get(pre_format_type, MetadataGeneralPreprocessor)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from services.bulk_upload_metadata_preprocessor_service import (
2+
MetadataPreprocessorService,
3+
)
4+
from utils.audit_logging_setup import LoggingService
5+
from utils.exceptions import InvalidFileNameException
6+
from utils.filename_utils import (
7+
assemble_lg_valid_file_name_full_path,
8+
extract_date_from_bulk_upload_file_name,
9+
extract_document_number_bulk_upload_file_name,
10+
extract_document_path_for_lloyd_george_record,
11+
extract_file_extension_from_bulk_upload_file_name,
12+
extract_lloyd_george_record_from_bulk_upload_file_name,
13+
extract_nhs_number_from_bulk_upload_file_name,
14+
extract_patient_name_from_bulk_upload_file_name,
15+
)
16+
17+
logger = LoggingService(__name__)
18+
19+
20+
class MetadataGeneralPreprocessor(MetadataPreprocessorService):
    """Pre-processor that rebuilds a record file name from its parsed parts."""

    def validate_record_filename(self, file_name: str, *args, **kwargs) -> str:
        """Parse ``file_name`` component by component and return the rebuilt path.

        Each ``extract_*`` helper takes the text still to be parsed and hands
        back the remainder, which is fed to the next helper. The extracted
        parts are then passed to ``assemble_lg_valid_file_name_full_path``.

        Args:
            file_name: The file name (including any path prefix) to parse.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The file name produced by ``assemble_lg_valid_file_name_full_path``.

        Raises:
            InvalidFileNameException: When the digit check below fails or when
                one of the helpers raises it; the error is logged and re-raised.
        """
        try:
            path_prefix, remainder = extract_document_path_for_lloyd_george_record(
                file_name
            )
            document_number, document_total, remainder = (
                extract_document_number_bulk_upload_file_name(remainder)
            )
            remainder = extract_lloyd_george_record_from_bulk_upload_file_name(
                remainder
            )
            patient_name, remainder = extract_patient_name_from_bulk_upload_file_name(
                remainder
            )

            # What is left must contain exactly 18 digits before the NHS number
            # and date are extracted.
            # NOTE(review): presumably 10 NHS-number digits plus 8 date digits - confirm.
            digit_count = sum(1 for character in remainder if character.isdigit())
            if digit_count != 18:
                logger.info("Failed to find NHS number or date")
                raise InvalidFileNameException("Incorrect NHS number or date format")

            nhs_number, remainder = extract_nhs_number_from_bulk_upload_file_name(
                remainder
            )
            record_date, remainder = extract_date_from_bulk_upload_file_name(remainder)
            extension = extract_file_extension_from_bulk_upload_file_name(remainder)

            new_file_name = assemble_lg_valid_file_name_full_path(
                path_prefix,
                document_number,
                document_total,
                patient_name,
                nhs_number,
                record_date,
                extension,
            )
            logger.info(f"Finished processing, new file name is: {new_file_name}")
            return new_file_name

        except InvalidFileNameException as error:
            logger.error(f"Failed to process {file_name} due to error: {error}")
            raise error
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import os
2+
from collections import defaultdict
3+
from datetime import date
4+
5+
from models.staging_metadata import NHS_NUMBER_FIELD_NAME
6+
from services.bulk_upload_metadata_preprocessor_service import (
7+
MetadataPreprocessorService,
8+
)
9+
from utils.audit_logging_setup import LoggingService
10+
from utils.exceptions import InvalidFileNameException
11+
from utils.filename_utils import (
12+
assemble_lg_valid_file_name_full_path,
13+
extract_date_from_bulk_upload_file_name,
14+
extract_document_number_bulk_upload_file_name,
15+
extract_document_path,
16+
extract_nhs_number_from_bulk_upload_file_name,
17+
extract_patient_name_from_bulk_upload_file_name,
18+
)
19+
20+
logger = LoggingService(__name__)
21+
22+
23+
class MetadataUsbPreprocessorService(MetadataPreprocessorService):
    """Pre-processor for USB-format uploads.

    Accepts only ``.pdf`` files, rejects a patient's files when more than one
    was counted for the same NHS number, and builds the new file name from
    details parsed out of the file's directory path.
    """

    def __init__(self, practice_directory: str):
        """Initialise the base service and an empty per-NHS-number file counter."""
        super().__init__(practice_directory)
        # Number of rows accepted by generate_renaming_map per NHS number.
        self.nhs_number_counts = defaultdict(int)

    def generate_renaming_map(self, metadata_rows: list[dict]):
        """Pre-filter rows by extension, count files per patient, then delegate.

        Rows whose ``FILEPATH`` fails the extension check are rejected here and
        are neither counted nor passed to the base class implementation.

        Args:
            metadata_rows: Metadata rows, each a dict read with ``.get``.

        Returns:
            A ``(renaming_map, rejected_rows, rejected_reasons)`` tuple, where
            the rejections from this method come before those reported by the
            base class.
        """
        accepted_rows = []
        rejected_rows = []
        rejected_reasons = []

        for row in metadata_rows:
            file_name = row.get("FILEPATH", "N/A")
            nhs_number = row.get(NHS_NUMBER_FIELD_NAME, "N/A")
            try:
                self._validate_file_extension(file_name)
                self._count_files_for_patient(nhs_number)
            except InvalidFileNameException as error:
                rejected_rows.append(row)
                rejected_reasons.append({"FILEPATH": file_name, "REASON": str(error)})
            else:
                accepted_rows.append(row)

        renaming_map, base_rejected_rows, base_rejected_reasons = (
            super().generate_renaming_map(accepted_rows)
        )

        all_rejected_rows = [*rejected_rows, *base_rejected_rows]
        all_rejected_reasons = [*rejected_reasons, *base_rejected_reasons]
        return renaming_map, all_rejected_rows, all_rejected_reasons

    def validate_record_filename(
        self, file_path, metadata_nhs_number=None, *args, **kwargs
    ) -> str:
        """Validate a single-file record and return its assembled file name.

        Args:
            file_path: Full path of the file being validated.
            metadata_nhs_number: NHS number to check against the counter and
                the path. NOTE(review): presumably taken from the metadata row
                by the base class - confirm against the caller.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The name built by ``assemble_lg_valid_file_name_full_path`` with
            both document numbers fixed to 1 and a ``.pdf`` extension.

        Raises:
            InvalidFileNameException: When more than one file was counted for
                the NHS number, when the file is part of a multi-part document,
                or when a path-parsing helper raises it.
        """
        self._validate_signal_file_for_patient(metadata_nhs_number)
        directory_path, file_name = extract_document_path(file_path)

        self._validate_document_parts(file_path, file_name)

        path_details = self._extract_metadata_from_path(directory_path)
        nhs_number, patient_name, date_of_birth = path_details

        # A mismatch is only logged; the NHS number from the path is still used.
        if metadata_nhs_number != nhs_number:
            logger.warning(
                f"File as it does not match the metadata NHS number: {file_path}"
            )

        return assemble_lg_valid_file_name_full_path(
            file_path_prefix=directory_path + "/",
            first_document_number=1,
            second_document_number=1,
            patient_name=patient_name,
            nhs_number=nhs_number,
            date_object=date_of_birth,
            file_extension=".pdf",
        )

    def _count_files_for_patient(self, nhs_number):
        """Record one more file for ``nhs_number``."""
        self.nhs_number_counts[nhs_number] += 1

    def _validate_signal_file_for_patient(self, nhs_number):
        """Raise if more than one file has been counted for ``nhs_number``."""
        files_counted = self.nhs_number_counts[nhs_number]
        if files_counted <= 1:
            return
        raise InvalidFileNameException(
            f"More than one file is found for {nhs_number}"
        )

    def _validate_file_extension(self, file_name: str) -> str:
        """Return the extension of ``file_name``, raising unless it is ``.pdf``."""
        _, file_extension = os.path.splitext(file_name)
        if file_extension == ".pdf":
            return file_extension

        logger.info("Rejecting file as it is not a PDF")
        raise InvalidFileNameException(
            f"File extension {file_extension} is not supported"
        )

    def _validate_document_parts(self, file_path: str, file_name: str):
        """Raise if ``file_name`` carries document numbers other than 1 of 1.

        A name from which no document numbers can be extracted is accepted.
        """
        try:
            numbers = extract_document_number_bulk_upload_file_name(file_name)
        except InvalidFileNameException:
            # No document numbering found in the name: nothing to validate.
            return

        if not numbers:
            return

        first_document_number, total_document_number, _ = numbers
        if first_document_number == 1 and total_document_number == 1:
            return

        logger.info(
            f"Rejecting file as it is part of a multi-part document: {file_path}"
        )
        raise InvalidFileNameException("Multi-part documents are not supported")

    def _extract_metadata_from_path(self, directory_path: str) -> tuple[str, str, date]:
        """Parse NHS number, patient name and date of birth from a directory path.

        The helpers are applied in that order, each one receiving the text left
        over by the previous one.
        """
        nhs_number, after_nhs_number = extract_nhs_number_from_bulk_upload_file_name(
            directory_path
        )
        patient_name, after_patient_name = (
            extract_patient_name_from_bulk_upload_file_name(after_nhs_number)
        )
        date_of_birth, _ = extract_date_from_bulk_upload_file_name(after_patient_name)
        return nhs_number, patient_name, date_of_birth

0 commit comments

Comments
 (0)