|
| 1 | +import os |
| 2 | +from collections import defaultdict |
| 3 | +from datetime import date |
| 4 | + |
| 5 | +from models.staging_metadata import NHS_NUMBER_FIELD_NAME |
| 6 | +from services.bulk_upload_metadata_preprocessor_service import ( |
| 7 | + MetadataPreprocessorService, |
| 8 | +) |
| 9 | +from utils.audit_logging_setup import LoggingService |
| 10 | +from utils.exceptions import InvalidFileNameException |
| 11 | +from utils.filename_utils import ( |
| 12 | + assemble_lg_valid_file_name_full_path, |
| 13 | + extract_date_from_bulk_upload_file_name, |
| 14 | + extract_document_number_bulk_upload_file_name, |
| 15 | + extract_document_path, |
| 16 | + extract_nhs_number_from_bulk_upload_file_name, |
| 17 | + extract_patient_name_from_bulk_upload_file_name, |
| 18 | +) |
| 19 | + |
| 20 | +logger = LoggingService(__name__) |
| 21 | + |
| 22 | + |
class MetadataUsbPreprocessorService(MetadataPreprocessorService):
    """Metadata pre-processor that accepts exactly one single-part PDF per patient.

    Layered on top of ``MetadataPreprocessorService``, this class:

    * rejects metadata rows whose file is not a ``.pdf``;
    * rejects every file of a patient (NHS number) that has more than one
      ``.pdf`` row in the metadata;
    * rejects files whose name carries a document number other than 1 of 1;
    * builds the target file name from the NHS number, patient name and date
      parsed out of the file's *directory* path.
    """

    def __init__(self, practice_directory: str):
        """Initialise the base pre-processor and an empty per-patient file tally."""
        super().__init__(practice_directory)
        # NHS number -> number of ``.pdf`` metadata rows seen for that patient.
        self.nhs_number_counts = defaultdict(int)

    def generate_renaming_map(self, metadata_rows: list[dict]):
        """Pre-filter ``metadata_rows`` and delegate the rest to the base class.

        Rows whose ``FILEPATH`` is not a ``.pdf`` are rejected here. The
        remaining rows are tallied per NHS number and handed to
        ``super().generate_renaming_map``; its rejections are appended after
        the ones produced locally.

        Args:
            metadata_rows: Metadata rows as dictionaries. ``FILEPATH`` and
                ``NHS_NUMBER_FIELD_NAME`` are read, each defaulting to
                ``"N/A"`` when the key is absent.

        Returns:
            A ``(renaming_map, rejected_rows, rejected_reasons)`` tuple.
            ``renaming_map`` is whatever the base class produced; each local
            entry of ``rejected_reasons`` is a dict with ``FILEPATH`` and
            ``REASON`` keys.
        """
        # Start every run from a clean tally. Without this, calling the method
        # twice on one instance would add to the previous counts and every
        # patient would be reported as having more than one file.
        self.nhs_number_counts = defaultdict(int)

        valid_metadata_rows = []
        rejected_rows = []
        rejected_reasons = []

        for row in metadata_rows:
            file_name = row.get("FILEPATH", "N/A")
            nhs_number = row.get(NHS_NUMBER_FIELD_NAME, "N/A")
            try:
                self._validate_file_extension(file_name)
            except InvalidFileNameException as error:
                rejected_rows.append(row)
                rejected_reasons.append({"FILEPATH": file_name, "REASON": str(error)})
            else:
                # Only rows that passed the extension check count towards the
                # one-file-per-patient rule.
                self._count_files_for_patient(nhs_number)
                valid_metadata_rows.append(row)

        # The complete tally must exist before delegating: the per-file
        # validation (``validate_record_filename``) consults it.
        # NOTE(review): presumably the base class calls
        # ``validate_record_filename`` for each row - confirm.
        renaming_map, super_rejected_rows, super_rejected_reasons = (
            super().generate_renaming_map(valid_metadata_rows)
        )

        rejected_rows.extend(super_rejected_rows)
        rejected_reasons.extend(super_rejected_reasons)

        return renaming_map, rejected_rows, rejected_reasons

    def validate_record_filename(
        self, file_path, metadata_nhs_number=None, *args, **kwargs
    ) -> str:
        """Validate one file path and return its normalised full path.

        Args:
            file_path: Path of the file as listed in the metadata.
            metadata_nhs_number: NHS number from the metadata row. It is used
                for the one-file-per-patient check and compared with the
                number parsed from the path.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The value of ``assemble_lg_valid_file_name_full_path`` for
            document 1 of 1 with a ``.pdf`` extension, placed in the file's
            original directory.

        Raises:
            InvalidFileNameException: If the patient has more than one file,
                the file is part of a multi-part document, or (presumably)
                the directory path cannot be parsed by the extraction helpers.
        """
        self._validate_signal_file_for_patient(metadata_nhs_number)
        directory_path, file_name = extract_document_path(file_path)

        self._validate_document_parts(file_path, file_name)

        nhs_number, patient_name, date_of_birth = self._extract_metadata_from_path(
            directory_path
        )

        # A mismatch is only logged; the file is still processed, and the NHS
        # number parsed from the path (not the metadata one) goes into the name.
        if nhs_number != metadata_nhs_number:
            logger.warning(
                f"NHS number in file path does not match the metadata NHS number: {file_path}"
            )

        return assemble_lg_valid_file_name_full_path(
            file_path_prefix=directory_path + "/",
            first_document_number=1,
            second_document_number=1,
            patient_name=patient_name,
            nhs_number=nhs_number,
            date_object=date_of_birth,
            file_extension=".pdf",
        )

    def _count_files_for_patient(self, nhs_number):
        """Record one more file for ``nhs_number`` in the per-patient tally."""
        self.nhs_number_counts[nhs_number] += 1

    def _validate_signal_file_for_patient(self, nhs_number):
        """Raise if more than one file has been tallied for ``nhs_number``.

        The method name is kept as-is for compatibility with existing callers.

        Raises:
            InvalidFileNameException: If the tally for ``nhs_number`` exceeds 1.
        """
        # ``.get`` instead of ``[...]``: indexing a defaultdict would insert a
        # zero entry for every NHS number that is merely looked up.
        if self.nhs_number_counts.get(nhs_number, 0) > 1:
            raise InvalidFileNameException(
                f"More than one file is found for {nhs_number}"
            )

    def _validate_file_extension(self, file_name: str) -> str:
        """Return the extension of ``file_name``, accepting only ``.pdf``.

        The comparison is case-sensitive, so e.g. ``.PDF`` is rejected.

        Raises:
            InvalidFileNameException: If the extension is anything but ``.pdf``.
        """
        file_extension = os.path.splitext(file_name)[1]
        if file_extension != ".pdf":
            logger.info("Rejecting file as it is not a PDF")
            raise InvalidFileNameException(
                f"File extension {file_extension} is not supported"
            )
        return file_extension

    def _validate_document_parts(self, file_path: str, file_name: str):
        """Reject files whose name marks them as part of a multi-part document.

        Args:
            file_path: Full path, used only in the log message.
            file_name: File name that is searched for a document number.

        Raises:
            InvalidFileNameException: If a document number is found and it is
                not 1 of 1.
        """
        try:
            numbers = extract_document_number_bulk_upload_file_name(file_name)
        except InvalidFileNameException:
            # Deliberate best effort: a name without a recognisable document
            # number is treated as a single document, not rejected.
            numbers = None

        if not numbers:
            return

        first_document_number, total_document_number, _ = numbers
        if first_document_number != 1 or total_document_number != 1:
            logger.info(
                f"Rejecting file as it is part of a multi-part document: {file_path}"
            )
            raise InvalidFileNameException("Multi-part documents are not supported")

    def _extract_metadata_from_path(self, directory_path: str) -> tuple[str, str, date]:
        """Parse NHS number, patient name and a date out of ``directory_path``.

        The three extraction helpers are chained: each one receives the
        remainder returned by the previous one, so the path is expected to
        contain the values in that order.

        Returns:
            ``(nhs_number, patient_name, date_of_birth)``.
        """
        nhs_number, remaining_path = extract_nhs_number_from_bulk_upload_file_name(
            directory_path
        )
        patient_name, remaining_path = extract_patient_name_from_bulk_upload_file_name(
            remaining_path
        )
        date_of_birth, _ = extract_date_from_bulk_upload_file_name(remaining_path)
        return nhs_number, patient_name, date_of_birth
0 commit comments