From 521338527c3ac79042690ad4f9b5d5e79089fdc4 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Wed, 12 Nov 2025 13:53:10 +0000 Subject: [PATCH 01/11] [PRMP-541] force runs virus scan --- .../bulk_upload_metadata_processor_handler.py | 29 +--- .../bulk_upload/bulk_upload_s3_repository.py | 31 ++++ .../bulk_upload_metadata_processor_service.py | 77 ++++++++- ..._bulk_upload_metadata_processor_service.py | 154 ++++++++++++++++++ .../unit/services/test_bulk_upload_service.py | 54 ++++++ 5 files changed, 322 insertions(+), 23 deletions(-) diff --git a/lambdas/handlers/bulk_upload_metadata_processor_handler.py b/lambdas/handlers/bulk_upload_metadata_processor_handler.py index 8f08d6621..f863b49c7 100644 --- a/lambdas/handlers/bulk_upload_metadata_processor_handler.py +++ b/lambdas/handlers/bulk_upload_metadata_processor_handler.py @@ -1,12 +1,8 @@ from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat -from services.bulk_upload.metadata_general_preprocessor import ( - MetadataGeneralPreprocessor, -) -from services.bulk_upload.metadata_usb_preprocessor import ( - MetadataUsbPreprocessorService, -) from services.bulk_upload_metadata_processor_service import ( BulkUploadMetadataProcessorService, + get_formatter_service, + handle_expedite_event, ) from utils.audit_logging_setup import LoggingService from utils.decorators.ensure_env_var import ensure_environment_variables @@ -24,6 +20,11 @@ ) @handle_lambda_exceptions def lambda_handler(event, _context): + if "source" in event and event.get("source") == "aws.s3": + logger.info("Handling EventBridge event from S3") + handle_expedite_event(event) + return + practice_directory = event.get("practiceDirectory", "") raw_pre_format_type = event.get( @@ -44,19 +45,3 @@ def lambda_handler(event, _context): metadata_formatter_service = formatter_service_class(practice_directory) metadata_service = BulkUploadMetadataProcessorService(metadata_formatter_service) metadata_service.process_metadata() - - -def get_formatter_service(raw_pre_format_type): - try: - pre_format_type = LloydGeorgePreProcessFormat(raw_pre_format_type) - if pre_format_type == LloydGeorgePreProcessFormat.GENERAL: - logger.info("Using general preFormatType") - return MetadataGeneralPreprocessor - elif pre_format_type == LloydGeorgePreProcessFormat.USB: - logger.info("Using usb preFormatType") - return MetadataUsbPreprocessorService - except ValueError: - logger.warning( - f"Invalid preFormatType: '{raw_pre_format_type}', defaulting to {LloydGeorgePreProcessFormat.GENERAL}." - ) - return MetadataGeneralPreprocessor diff --git a/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py b/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py index c0c341c8c..02798b15e 100644 --- a/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py +++ b/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py @@ -96,3 +96,34 @@ def rollback_transaction(self): def file_exists_on_staging_bucket(self, file_key: str) -> bool: return self.s3_repository.file_exist_on_s3(self.staging_bucket_name, file_key) + + def check_file_tag_status(self, file_key: str) -> str: + """ + Retrieves the virus scan tag value for a single file. + Raises specific exceptions based on the tag's presence or S3 access. + """ + s3_service = self.s3_repository + + try: + # Call the underlying S3 method to get the tag value + raw_scan_result = s3_service.get_tag_value( + s3_bucket_name=self.staging_bucket_name, + file_key=file_key, + tag_key=SCAN_RESULT_TAG_KEY, + ) + return raw_scan_result + + except TagNotFoundException: + raise VirusScanNoResultException( + f"Virus scan result not found for document: {file_key}" + ) + except ClientError as e: + error_msg = str(e) + if "AccessDenied" in str(e) or "NoSuchKey" in error_msg: + _logger.info( + f"Failed to check object tag for given file_path: {file_key}" + ) + _logger.info("file_path may be incorrect or contain invalid character") + raise S3FileNotFoundException(f"Failed to access file {file_key}") + else: + raise e diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index 653d4d01e..bc097020b 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -2,6 +2,7 @@ import os import shutil import tempfile +import urllib.parse import uuid from collections import defaultdict from datetime import datetime @@ -9,18 +10,27 @@ import pydantic from botocore.exceptions import ClientError +from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from enums.upload_status import UploadStatus +from models.staging_metadata import ( + StagingSqsMetadata, # Simulated dependency for check_virus_result +) from models.staging_metadata import ( METADATA_FILENAME, BulkUploadQueueMetadata, MetadataFile, - StagingSqsMetadata, ) from repositories.bulk_upload.bulk_upload_dynamo_repository import ( BulkUploadDynamoRepository, ) from services.base.s3_service import S3Service from services.base.sqs_service import SQSService +from services.bulk_upload.metadata_general_preprocessor import ( + MetadataGeneralPreprocessor, +) +from services.bulk_upload.metadata_usb_preprocessor import ( + MetadataUsbPreprocessorService, +) from services.bulk_upload_metadata_preprocessor_service import ( MetadataPreprocessorService, ) @@ -29,6 +39,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, + TagNotFoundException, ) from utils.lloyd_george_validator import validate_file_name @@ -211,3 +222,67 @@ def copy_metadata_to_dated_folder(self): def clear_temp_storage(self): logger.info("Clearing temp storage directory") shutil.rmtree(self.temp_download_dir) + + +def handle_expedite_event(event): + try: + key_string = event["detail"]["object"]["key"] + key = urllib.parse.unquote_plus(key_string, encoding="utf-8") + + if key.startswith("expedite/"): + logger.info("Processing file from expedite folder") + + enforce_virus_scanner(key) + + return # To be added upon by ticket PRMP-540 + else: + failure_msg = f"Unexpected directory or file location received from EventBridge: {key_string}" + logger.error(failure_msg) + raise BulkUploadMetadataException(failure_msg) + except KeyError as e: + failure_msg = f"Failed due to missing key: {str(e)}" + logger.error(failure_msg) + raise BulkUploadMetadataException(failure_msg) + + +def enforce_virus_scanner(self, file_key: str): + logger.info( + f"Checking virus scan result for file: {file_key} in {self.staging_bucket_name}" + ) + + try: + self.s3_repo.check_file_tag_status(file_key) + logger.info("The file has been scanned before") + return + + except TagNotFoundException: + # File has not been scanned. + logger.info(f"Virus scan tag missing for {file_key}.") + # force scan the file + self.virus_scan_service.scan_file(file_ref=file_key) + + except ClientError as e: + error_message = str(e) + if "NoSuchKey" in error_message or "AccessDenied" in error_message: + logger.error(f"S3 access error when checking tag for {file_key}.") + raise BulkUploadMetadataException( + f"Failed to access S3 file {file_key} during tag check." + ) + else: + raise + + +def get_formatter_service(raw_pre_format_type): + try: + pre_format_type = LloydGeorgePreProcessFormat(raw_pre_format_type) + if pre_format_type == LloydGeorgePreProcessFormat.GENERAL: + logger.info("Using general preFormatType") + return MetadataGeneralPreprocessor + elif pre_format_type == LloydGeorgePreProcessFormat.USB: + logger.info("Using usb preFormatType") + return MetadataUsbPreprocessorService + except ValueError: + logger.warning( + f"Invalid preFormatType: '{raw_pre_format_type}', defaulting to {LloydGeorgePreProcessFormat.GENERAL}." + ) + return MetadataGeneralPreprocessor diff --git a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py index 0f5ccd421..3c1847863 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py @@ -1,5 +1,7 @@ import os import tempfile +import urllib +import urllib.parse from collections import defaultdict from unittest.mock import call @@ -30,6 +32,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, + TagNotFoundException, ) METADATA_FILE_DIR = "tests/unit/helpers/data/bulk_upload" @@ -589,3 +592,154 @@ def test_validate_and_correct_filename_sad_path( base_metadata_file.file_path ) assert result == "corrected/path/file_corrected.pdf" + + +def test_handle_expedite_event_calls_enforce_for_expedite_key(mocker): + encoded_key = urllib.parse.quote_plus("expedite/folder/some file.pdf") + event = {"detail": {"object": {"key": encoded_key}}} + + mocked_enforce = mocker.patch(f"{SERVICE_PATH}.enforce_virus_scanner") + + from services.bulk_upload_metadata_processor_service import handle_expedite_event + + handle_expedite_event(event) + + mocked_enforce.assert_called_once_with("expedite/folder/some file.pdf") + + +def test_handle_expedite_event_raises_on_unexpected_directory(): + event = {"detail": {"object": {"key": "uploads/something.pdf"}}} + + from services.bulk_upload_metadata_processor_service import handle_expedite_event + + with pytest.raises(BulkUploadMetadataException) as excinfo: + handle_expedite_event(event) + + assert "Unexpected directory or file location received from EventBridge" in str( + excinfo.value + ) + + +def test_handle_expedite_event_raises_on_missing_key(): + event = {"detail": {"object": {}}} + + from services.bulk_upload_metadata_processor_service import handle_expedite_event + + with pytest.raises(BulkUploadMetadataException) as excinfo: + handle_expedite_event(event) + + assert "Failed due to missing key" in str(excinfo.value) + + +class dummyService: + def __init__( + self, s3_repo, virus_scan_service, staging_bucket_name="staging-bucket" + ): + self.s3_repo = s3_repo + self.virus_scan_service = virus_scan_service + self.staging_bucket_name = staging_bucket_name + + +def test_enforce_virus_scanner_returns_when_tag_present(mocker): + s3_repo = mocker.Mock() + virus_scan_service = mocker.Mock() + svc = dummyService(s3_repo, virus_scan_service) + + file_key = "expedite/folder/file.pdf" + + from services.bulk_upload_metadata_processor_service import enforce_virus_scanner + + enforce_virus_scanner(svc, file_key) + + s3_repo.check_file_tag_status.assert_called_once_with(file_key) + virus_scan_service.scan_file.assert_not_called() + + +def test_enforce_virus_scanner_triggers_scan_when_tag_missing(mocker): + s3_repo = mocker.Mock() + s3_repo.check_file_tag_status.side_effect = TagNotFoundException("no tag") + virus_scan_service = mocker.Mock() + svc = dummyService(s3_repo, virus_scan_service) + + file_key = "expedite/folder/file.pdf" + + from services.bulk_upload_metadata_processor_service import enforce_virus_scanner + + enforce_virus_scanner(svc, file_key) + + virus_scan_service.scan_file.assert_called_once_with(file_ref=file_key) + + +@pytest.mark.parametrize("fragment", ["NoSuchKey", "AccessDenied"]) +def test_enforce_virus_scanner_wraps_access_errors(fragment, mocker): + s3_repo = mocker.Mock() + virus_scan_service = mocker.Mock() + + err = ClientError({"Error": {"Code": "S3Error", "Message": fragment}}, "HeadObject") + s3_repo.check_file_tag_status.side_effect = err + + svc = dummyService(s3_repo, virus_scan_service) + file_key = "expedite/folder/file.pdf" + + from services.bulk_upload_metadata_processor_service import enforce_virus_scanner + + with pytest.raises(BulkUploadMetadataException) as excinfo: + enforce_virus_scanner(svc, file_key) + + assert f"Failed to access S3 file {file_key} during tag check." in str( + excinfo.value + ) + virus_scan_service.scan_file.assert_not_called() + + +def test_enforce_virus_scanner_reraises_other_client_errors(mocker): + s3_repo = mocker.Mock() + virus_scan_service = mocker.Mock() + + err = ClientError( + {"Error": {"Code": "ThrottlingException", "Message": "Rate exceeded"}}, + "HeadObject", + ) + s3_repo.check_file_tag_status.side_effect = err + + svc = dummyService(s3_repo, virus_scan_service) + file_key = "expedite/folder/file.pdf" + + from services.bulk_upload_metadata_processor_service import enforce_virus_scanner + + with pytest.raises(ClientError): + enforce_virus_scanner(svc, file_key) + + virus_scan_service.scan_file.assert_not_called() + + +def test_get_formatter_service_returns_general_for_general_value(): + from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat + from services.bulk_upload.metadata_general_preprocessor import ( + MetadataGeneralPreprocessor, + ) + from services.bulk_upload_metadata_processor_service import get_formatter_service + + cls = get_formatter_service(LloydGeorgePreProcessFormat.GENERAL.value) + assert cls is MetadataGeneralPreprocessor + + +def test_get_formatter_service_returns_usb_for_usb_value(): + from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat + from services.bulk_upload.metadata_usb_preprocessor import ( + MetadataUsbPreprocessorService, + ) + from services.bulk_upload_metadata_processor_service import get_formatter_service + + cls = get_formatter_service(LloydGeorgePreProcessFormat.USB.value) + assert cls is MetadataUsbPreprocessorService + + +def test_get_formatter_service_defaults_to_general_on_invalid_value(): + from services.bulk_upload.metadata_general_preprocessor import ( + MetadataGeneralPreprocessor, + ) + from services.bulk_upload_metadata_processor_service import get_formatter_service + + cls = get_formatter_service("this-is-not-valid") + assert cls is MetadataGeneralPreprocessor diff --git a/lambdas/tests/unit/services/test_bulk_upload_service.py b/lambdas/tests/unit/services/test_bulk_upload_service.py index 6436d6f15..e99025caa 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_service.py @@ -50,6 +50,7 @@ PatientRecordAlreadyExistException, PdsTooManyRequestsException, S3FileNotFoundException, + TagNotFoundException, VirusScanNoResultException, ) from utils.lloyd_george_validator import LGInvalidFilesException @@ -1097,3 +1098,56 @@ def test_patient_not_found_is_caught_and_written_to_dynamo( assert call_status == UploadStatus.FAILED assert call_reason == expected_error_message assert call_metadata == TEST_STAGING_METADATA + + +@pytest.fixture +def repo(mocker): + r = BulkUploadS3Repository.__new__(BulkUploadS3Repository) # skip __init__ + r.s3_repository = mocker.Mock() + r.staging_bucket_name = MOCK_STAGING_STORE_BUCKET + return r + + +@pytest.fixture +def file_key(): + return "expedite/folder/file.pdf" + + +def test_check_file_tag_status_returns_value_when_tag_exists(repo, file_key): + repo.s3_repository.get_tag_value.return_value = "CLEAN" + + result = repo.check_file_tag_status(file_key) + + assert result == "CLEAN" + repo.s3_repository.get_tag_value.assert_called_once_with( + s3_bucket_name=MOCK_STAGING_STORE_BUCKET, + file_key=file_key, + tag_key=SCAN_RESULT_TAG_KEY, + ) + + +def test_check_file_tag_status_raises_no_result_when_tag_missing(repo, file_key): + repo.s3_repository.get_tag_value.side_effect = TagNotFoundException("no tag") + + with pytest.raises(VirusScanNoResultException): + repo.check_file_tag_status(file_key) + + +@pytest.mark.parametrize("fragment", ["AccessDenied", "NoSuchKey"]) +def test_wraps_access_errors_as_s3_not_found(repo, file_key, fragment): + repo.s3_repository.get_tag_value.side_effect = ClientError( + {"Error": {"Code": "S3Error", "Message": fragment}}, "GetObject" + ) + + with pytest.raises(S3FileNotFoundException): + repo.check_file_tag_status(file_key) + + +def test_reraises_other_client_errors(repo, file_key): + repo.s3_repository.get_tag_value.side_effect = ClientError( + {"Error": {"Code": "ThrottlingException", "Message": "Rate exceeded"}}, + "GetObject", + ) + + with pytest.raises(ClientError): + repo.check_file_tag_status(file_key) From 97934fa7558275fada382591dcbf31ec3d22f305 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Thu, 13 Nov 2025 13:41:58 +0000 Subject: [PATCH 02/11] [PRMP-541] fixed formating issues --- .../bulk_upload_metadata_processor_handler.py | 21 ++-- .../bulk_upload_metadata_processor_service.py | 105 ++++++++-------- ..._bulk_upload_metadata_processor_service.py | 114 +++--------------- 3 files changed, 81 insertions(+), 159 deletions(-) diff --git a/lambdas/handlers/bulk_upload_metadata_processor_handler.py b/lambdas/handlers/bulk_upload_metadata_processor_handler.py index f863b49c7..592f1beee 100644 --- a/lambdas/handlers/bulk_upload_metadata_processor_handler.py +++ b/lambdas/handlers/bulk_upload_metadata_processor_handler.py @@ -2,7 +2,6 @@ from services.bulk_upload_metadata_processor_service import ( BulkUploadMetadataProcessorService, get_formatter_service, - handle_expedite_event, ) from utils.audit_logging_setup import LoggingService from utils.decorators.ensure_env_var import ensure_environment_variables @@ -20,18 +19,22 @@ ) @handle_lambda_exceptions def lambda_handler(event, _context): - if "source" in event and event.get("source") == "aws.s3": - logger.info("Handling EventBridge event from S3") - handle_expedite_event(event) - return - - practice_directory = event.get("practiceDirectory", "") - raw_pre_format_type = event.get( "preFormatType", LloydGeorgePreProcessFormat.GENERAL ) formatter_service_class = get_formatter_service(raw_pre_format_type) + practice_directory = event.get("practiceDirectory", "") + + metadata_formatter_service = formatter_service_class(practice_directory) + metadata_service = BulkUploadMetadataProcessorService(metadata_formatter_service) + + if "source" in event and event.get("source") == "aws.s3": + logger.info("Handling EventBridge event from S3") + + metadata_service.handle_expedite_event(event) + return + if not practice_directory: logger.error( "Failed to start metadata processing due to missing practice directory" @@ -42,6 +45,4 @@ def lambda_handler(event, _context): f"Starting metadata processing for practice directory: {practice_directory}" ) - metadata_formatter_service = formatter_service_class(practice_directory) - metadata_service = BulkUploadMetadataProcessorService(metadata_formatter_service) metadata_service.process_metadata() diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index bc097020b..d3bbb8eaa 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -13,7 +13,7 @@ from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from enums.upload_status import UploadStatus from models.staging_metadata import ( - StagingSqsMetadata, # Simulated dependency for check_virus_result + StagingSqsMetadata, ) from models.staging_metadata import ( METADATA_FILENAME, @@ -23,6 +23,7 @@ from repositories.bulk_upload.bulk_upload_dynamo_repository import ( BulkUploadDynamoRepository, ) +from repositories.bulk_upload.bulk_upload_s3_repository import BulkUploadS3Repository from services.base.s3_service import S3Service from services.base.sqs_service import SQSService from services.bulk_upload.metadata_general_preprocessor import ( @@ -34,6 +35,7 @@ from services.bulk_upload_metadata_preprocessor_service import ( MetadataPreprocessorService, ) +from services.virus_scan_result_service import VirusScanService from utils.audit_logging_setup import LoggingService from utils.exceptions import ( BulkUploadMetadataException, @@ -53,6 +55,9 @@ def __init__(self, metadata_formatter_service: MetadataPreprocessorService): self.sqs_service = SQSService() self.dynamo_repository = BulkUploadDynamoRepository() + self.s3_repo = BulkUploadS3Repository() + self.virus_scan_service = VirusScanService() + self.staging_bucket_name = os.getenv("STAGING_STORE_BUCKET_NAME") self.metadata_queue_url = os.getenv("METADATA_SQS_QUEUE_URL") @@ -113,7 +118,7 @@ def csv_to_sqs_metadata(self, csv_file_path: str) -> list[StagingSqsMetadata]: ) with open( - csv_file_path, mode="r", encoding="utf-8-sig", errors="replace" + csv_file_path, mode="r", encoding="utf-8-sig", errors="replace" ) as csv_file_handler: csv_reader: Iterable[dict] = csv.DictReader(csv_file_handler) for row in csv_reader: @@ -128,7 +133,7 @@ def csv_to_sqs_metadata(self, csv_file_path: str) -> list[StagingSqsMetadata]: ] def process_metadata_row( - self, row: dict, patients: dict[tuple[str, str], list[BulkUploadQueueMetadata]] + self, row: dict, patients: dict[tuple[str, str], list[BulkUploadQueueMetadata]] ) -> None: file_metadata = MetadataFile.model_validate(row) nhs_number, ods_code = self.extract_patient_info(file_metadata) @@ -144,7 +149,7 @@ def process_metadata_row( @staticmethod def convert_to_sqs_metadata( - file: MetadataFile, stored_file_name: str + file: MetadataFile, stored_file_name: str ) -> BulkUploadQueueMetadata: return BulkUploadQueueMetadata( **file.model_dump(), stored_file_name=stored_file_name @@ -156,8 +161,8 @@ def extract_patient_info(self, file_metadata: MetadataFile) -> tuple[str, str]: return nhs_number, ods_code def validate_and_correct_filename( - self, - file_metadata: MetadataFile, + self, + file_metadata: MetadataFile, ) -> str: try: validate_file_name(file_metadata.file_path.split("/")[-1]) @@ -170,10 +175,10 @@ def validate_and_correct_filename( return valid_filepath def handle_invalid_filename( - self, - file_metadata: MetadataFile, - error: InvalidFileNameException, - nhs_number: str, + self, + file_metadata: MetadataFile, + error: InvalidFileNameException, + nhs_number: str, ) -> None: logger.error( f"Failed to process {file_metadata.file_path} due to error: {error}" @@ -190,7 +195,7 @@ def handle_invalid_filename( ) def send_metadata_to_fifo_sqs( - self, staging_sqs_metadata_list: list[StagingSqsMetadata] + self, staging_sqs_metadata_list: list[StagingSqsMetadata] ) -> None: sqs_group_id = f"bulk_upload_{uuid.uuid4()}" @@ -223,53 +228,51 @@ def clear_temp_storage(self): logger.info("Clearing temp storage directory") shutil.rmtree(self.temp_download_dir) + def enforce_virus_scanner(self, file_key: str): + logger.info( + f"Checking virus scan result for file: {file_key} in {self.staging_bucket_name}" + ) -def handle_expedite_event(event): - try: - key_string = event["detail"]["object"]["key"] - key = urllib.parse.unquote_plus(key_string, encoding="utf-8") + try: + self.s3_repo.check_file_tag_status(file_key) + logger.info("The file has been scanned before") + return - if key.startswith("expedite/"): - logger.info("Processing file from expedite folder") + except TagNotFoundException: + # File has not been scanned. + logger.info(f"Virus scan tag missing for {file_key}.") + # force scan the file + self.virus_scan_service.scan_file(file_ref=file_key) - enforce_virus_scanner(key) + except ClientError as e: + error_message = str(e) + if "NoSuchKey" in error_message or "AccessDenied" in error_message: + logger.error(f"S3 access error when checking tag for {file_key}.") + raise BulkUploadMetadataException( + f"Failed to access S3 file {file_key} during tag check." + ) + else: + raise - return # To be added upon by ticket PRMP-540 - else: - failure_msg = f"Unexpected directory or file location received from EventBridge: {key_string}" - logger.error(failure_msg) - raise BulkUploadMetadataException(failure_msg) - except KeyError as e: - failure_msg = f"Failed due to missing key: {str(e)}" - logger.error(failure_msg) - raise BulkUploadMetadataException(failure_msg) + def handle_expedite_event(self, event): + try: + key_string = event["detail"]["object"]["key"] + key = urllib.parse.unquote_plus(key_string, encoding="utf-8") + if key.startswith("expedite/"): + logger.info("Processing file from expedite folder") -def enforce_virus_scanner(self, file_key: str): - logger.info( - f"Checking virus scan result for file: {file_key} in {self.staging_bucket_name}" - ) + self.enforce_virus_scanner(key) - try: - self.s3_repo.check_file_tag_status(file_key) - logger.info("The file has been scanned before") - return - - except TagNotFoundException: - # File has not been scanned. - logger.info(f"Virus scan tag missing for {file_key}.") - # force scan the file - self.virus_scan_service.scan_file(file_ref=file_key) - - except ClientError as e: - error_message = str(e) - if "NoSuchKey" in error_message or "AccessDenied" in error_message: - logger.error(f"S3 access error when checking tag for {file_key}.") - raise BulkUploadMetadataException( - f"Failed to access S3 file {file_key} during tag check." - ) - else: - raise + return # To be added upon by ticket PRMP-540 + else: + failure_msg = f"Unexpected directory or file location received from EventBridge: {key_string}" + logger.error(failure_msg) + raise BulkUploadMetadataException(failure_msg) + except KeyError as e: + failure_msg = f"Failed due to missing key: {str(e)}" + logger.error(failure_msg) + raise BulkUploadMetadataException(failure_msg) def get_formatter_service(raw_pre_format_type): diff --git a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py index 3c1847863..446087157 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py @@ -302,7 +302,7 @@ def test_download_metadata_from_s3_raise_error_when_failed_to_download( test_service.download_metadata_from_s3() -class TestMetadataPreprocessorService(MetadataPreprocessorService): +class MockTestMetadataPreprocessorService(MetadataPreprocessorService): # Renamed def validate_record_filename(self, original_filename: str, *args, **kwargs) -> str: return original_filename @@ -310,7 +310,7 @@ def validate_record_filename(self, original_filename: str, *args, **kwargs) -> s @pytest.fixture def bulk_upload_service(): return BulkUploadMetadataProcessorService( - TestMetadataPreprocessorService(practice_directory="test_practice_directory") + MockTestMetadataPreprocessorService(practice_directory="test_practice_directory") ) @@ -594,124 +594,42 @@ def test_validate_and_correct_filename_sad_path( assert result == "corrected/path/file_corrected.pdf" -def test_handle_expedite_event_calls_enforce_for_expedite_key(mocker): +def test_handle_expedite_event_calls_enforce_for_expedite_key(mocker, test_service): encoded_key = urllib.parse.quote_plus("expedite/folder/some file.pdf") event = {"detail": {"object": {"key": encoded_key}}} - mocked_enforce = mocker.patch(f"{SERVICE_PATH}.enforce_virus_scanner") + mocked_enforce = mocker.patch.object(test_service, "enforce_virus_scanner") - from services.bulk_upload_metadata_processor_service import handle_expedite_event + test_service.handle_expedite_event(event) - handle_expedite_event(event) + decoded_key = "expedite/folder/some file.pdf" + mocked_enforce.assert_called_once_with(decoded_key) - mocked_enforce.assert_called_once_with("expedite/folder/some file.pdf") - -def test_handle_expedite_event_raises_on_unexpected_directory(): +def test_handle_expedite_event_raises_on_unexpected_directory(mocker, test_service): + mocked_enforce = mocker.patch.object(test_service, "enforce_virus_scanner") event = {"detail": {"object": {"key": "uploads/something.pdf"}}} - from services.bulk_upload_metadata_processor_service import handle_expedite_event - with pytest.raises(BulkUploadMetadataException) as excinfo: - handle_expedite_event(event) + test_service.handle_expedite_event(event) assert "Unexpected directory or file location received from EventBridge" in str( excinfo.value ) + mocked_enforce.assert_not_called() -def test_handle_expedite_event_raises_on_missing_key(): - event = {"detail": {"object": {}}} - from services.bulk_upload_metadata_processor_service import handle_expedite_event +def test_handle_expedite_event_raises_on_missing_key(mocker, test_service): + mocked_enforce = mocker.patch.object(test_service, "enforce_virus_scanner") + event = {"detail": {"object": {}}} with pytest.raises(BulkUploadMetadataException) as excinfo: - handle_expedite_event(event) + test_service.handle_expedite_event(event) assert "Failed due to missing key" in str(excinfo.value) - -class dummyService: - def __init__( - self, s3_repo, virus_scan_service, staging_bucket_name="staging-bucket" - ): - self.s3_repo = s3_repo - self.virus_scan_service = virus_scan_service - self.staging_bucket_name = staging_bucket_name - - -def test_enforce_virus_scanner_returns_when_tag_present(mocker): - s3_repo = mocker.Mock() - virus_scan_service = mocker.Mock() - svc = dummyService(s3_repo, virus_scan_service) - - file_key = "expedite/folder/file.pdf" - - from services.bulk_upload_metadata_processor_service import enforce_virus_scanner - - enforce_virus_scanner(svc, file_key) - - s3_repo.check_file_tag_status.assert_called_once_with(file_key) - virus_scan_service.scan_file.assert_not_called() - - -def test_enforce_virus_scanner_triggers_scan_when_tag_missing(mocker): - s3_repo = mocker.Mock() - s3_repo.check_file_tag_status.side_effect = TagNotFoundException("no tag") - virus_scan_service = mocker.Mock() - svc = dummyService(s3_repo, virus_scan_service) - - file_key = "expedite/folder/file.pdf" - - from services.bulk_upload_metadata_processor_service import enforce_virus_scanner - - enforce_virus_scanner(svc, file_key) - - virus_scan_service.scan_file.assert_called_once_with(file_ref=file_key) - - -@pytest.mark.parametrize("fragment", ["NoSuchKey", "AccessDenied"]) -def test_enforce_virus_scanner_wraps_access_errors(fragment, mocker): - s3_repo = mocker.Mock() - virus_scan_service = mocker.Mock() - - err = ClientError({"Error": {"Code": "S3Error", "Message": fragment}}, "HeadObject") - s3_repo.check_file_tag_status.side_effect = err - - svc = dummyService(s3_repo, virus_scan_service) - file_key = "expedite/folder/file.pdf" - - from services.bulk_upload_metadata_processor_service import enforce_virus_scanner - - with pytest.raises(BulkUploadMetadataException) as excinfo: - enforce_virus_scanner(svc, file_key) - - assert f"Failed to access S3 file {file_key} during tag check." in str( - excinfo.value - ) - virus_scan_service.scan_file.assert_not_called() - - -def test_enforce_virus_scanner_reraises_other_client_errors(mocker): - s3_repo = mocker.Mock() - virus_scan_service = mocker.Mock() - - err = ClientError( - {"Error": {"Code": "ThrottlingException", "Message": "Rate exceeded"}}, - "HeadObject", - ) - s3_repo.check_file_tag_status.side_effect = err - - svc = dummyService(s3_repo, virus_scan_service) - file_key = "expedite/folder/file.pdf" - - from services.bulk_upload_metadata_processor_service import enforce_virus_scanner - - with pytest.raises(ClientError): - enforce_virus_scanner(svc, file_key) - - virus_scan_service.scan_file.assert_not_called() - + mocked_enforce.assert_not_called() def test_get_formatter_service_returns_general_for_general_value(): from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat From e3e34c83a245fa98f023c2e8d6e29bdf39b7a101 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Mon, 17 Nov 2025 08:34:23 +0000 Subject: [PATCH 03/11] [PRMP-541] catch correct exception --- .../bulk_upload_metadata_processor_service.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index d3bbb8eaa..d746c858a 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -10,16 +10,17 @@ import pydantic from botocore.exceptions import ClientError + from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from enums.upload_status import UploadStatus -from models.staging_metadata import ( - StagingSqsMetadata, -) from models.staging_metadata import ( METADATA_FILENAME, BulkUploadQueueMetadata, MetadataFile, ) +from models.staging_metadata import ( + StagingSqsMetadata, +) from repositories.bulk_upload.bulk_upload_dynamo_repository import ( BulkUploadDynamoRepository, ) @@ -41,7 +42,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, - TagNotFoundException, + VirusScanNoResultException, ) from utils.lloyd_george_validator import validate_file_name @@ -238,7 +239,7 @@ def enforce_virus_scanner(self, file_key: str): logger.info("The file has been scanned before") return - except TagNotFoundException: + except VirusScanNoResultException: # File has not been scanned. logger.info(f"Virus scan tag missing for {file_key}.") # force scan the file From 76eff557626e42e175d452722477ca844a59f6e3 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Mon, 17 Nov 2025 10:08:51 +0000 Subject: [PATCH 04/11] [PRMP-541] added the stub --- lambdas/services/bulk_upload_metadata_processor_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index d746c858a..4b588686d 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -45,6 +45,7 @@ VirusScanNoResultException, ) from utils.lloyd_george_validator import validate_file_name +from utils.utilities import get_virus_scan_service logger = LoggingService(__name__) UNSUCCESSFUL = "Unsuccessful bulk upload" @@ -57,7 +58,7 @@ def __init__(self, metadata_formatter_service: MetadataPreprocessorService): self.dynamo_repository = BulkUploadDynamoRepository() self.s3_repo = BulkUploadS3Repository() - self.virus_scan_service = VirusScanService() + self.virus_scan_service = get_virus_scan_service() self.staging_bucket_name = os.getenv("STAGING_STORE_BUCKET_NAME") self.metadata_queue_url = os.getenv("METADATA_SQS_QUEUE_URL") From 451c1cb4c4f8fa74aa6bcd33d86baef9e34d5b62 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Mon, 17 Nov 2025 14:00:21 +0000 Subject: [PATCH 05/11] [PRMP-541] merged with main and fixed tests --- .../bulk_upload_metadata_processor_handler.py | 13 ++---- ..._bulk_upload_metadata_processor_service.py | 46 ++++++++++++++++--- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/lambdas/handlers/bulk_upload_metadata_processor_handler.py b/lambdas/handlers/bulk_upload_metadata_processor_handler.py index 98eb43803..3404706ea 100644 --- a/lambdas/handlers/bulk_upload_metadata_processor_handler.py +++ b/lambdas/handlers/bulk_upload_metadata_processor_handler.py @@ -25,8 +25,12 @@ def lambda_handler(event, _context): formatter_service_class = get_formatter_service(raw_pre_format_type) practice_directory = event.get("practiceDirectory", "") + remappings = event.get("metadataFieldRemappings", {}) metadata_formatter_service = formatter_service_class(practice_directory) - metadata_service = BulkUploadMetadataProcessorService(metadata_formatter_service) + metadata_service = BulkUploadMetadataProcessorService( + metadata_formatter_service=metadata_formatter_service, + metadata_heading_remap=remappings, + ) if "source" in event and event.get("source") == "aws.s3": logger.info("Handling EventBridge event from S3") @@ -44,11 +48,4 @@ def lambda_handler(event, _context): f"Starting metadata processing for practice directory: {practice_directory}" ) - remappings = event.get("metadataFieldRemappings", {}) - - metadata_formatter_service = formatter_service_class(practice_directory) - metadata_service = BulkUploadMetadataProcessorService( - metadata_formatter_service=metadata_formatter_service, - metadata_heading_remap=remappings, - ) metadata_service.process_metadata() diff --git a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py index bcd866dc7..5ef8649f1 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py @@ -64,6 +64,12 @@ def test_service(mocker, set_env, mock_tempfile): mocker.patch( "services.bulk_upload_metadata_processor_service.BulkUploadDynamoRepository" ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.BulkUploadS3Repository" + ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.get_virus_scan_service" + ) service = BulkUploadMetadataProcessorService( metadata_formatter_service=MockMetadataPreprocessorService( @@ -304,7 +310,19 @@ def validate_record_filename(self, original_filename: str, *args, **kwargs) -> s @pytest.fixture -def bulk_upload_service(): +def bulk_upload_service(mocker, set_env, mock_tempfile): + mocker.patch("services.bulk_upload_metadata_processor_service.S3Service") + mocker.patch("services.bulk_upload_metadata_processor_service.SQSService") + mocker.patch( + "services.bulk_upload_metadata_processor_service.BulkUploadDynamoRepository" + ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.BulkUploadS3Repository" + ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.get_virus_scan_service" + ) + return BulkUploadMetadataProcessorService( metadata_formatter_service=TestMetadataPreprocessorService( practice_directory="test_practice_directory" @@ -313,6 +331,7 @@ def bulk_upload_service(): ) + def test_duplicates_csv_to_sqs_metadata(mocker, bulk_upload_service): header = "FILEPATH,PAGE COUNT,GP-PRACTICE-CODE,NHS-NO,SECTION,SUB-SECTION,SCAN-DATE,SCAN-ID,USER-ID,UPLOAD" line1 = ( @@ -699,14 +718,21 @@ def test_clear_temp_storage_handles_missing_directory(mocker, test_service): mock_rm.assert_called_once_with(test_service.temp_download_dir) -@pytest.fixture(autouse=True) +@pytest.fixture @freeze_time("2025-01-01T12:00:00") -def mock_service_remapping_mandatory_fields(mocker): +def mock_service_remapping_mandatory_fields(mocker, set_env, mock_tempfile): + # Patch out external dependencies so __init__ doesn't touch real AWS/services mocker.patch("services.bulk_upload_metadata_processor_service.S3Service") mocker.patch("services.bulk_upload_metadata_processor_service.SQSService") mocker.patch( "services.bulk_upload_metadata_processor_service.BulkUploadDynamoRepository" ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.BulkUploadS3Repository" + ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.get_virus_scan_service" + ) service = BulkUploadMetadataProcessorService( metadata_formatter_service=MockMetadataPreprocessorService( @@ -732,8 +758,8 @@ def mock_service_remapping_mandatory_fields(mocker): "process_metadata_row", wraps=service.process_metadata_row, ) - mocker.patch.object(service, "s3_service") + return service @@ -771,14 +797,21 @@ def test_remapping_mandatory_fields( assert result == expected -@pytest.fixture(autouse=True) +@pytest.fixture @freeze_time("2025-01-01T12:00:00") -def mock_service_no_remapping(mocker): +def mock_service_no_remapping(mocker, set_env, mock_tempfile): + # Patch out external dependencies so __init__ doesn't touch real AWS/services mocker.patch("services.bulk_upload_metadata_processor_service.S3Service") mocker.patch("services.bulk_upload_metadata_processor_service.SQSService") mocker.patch( "services.bulk_upload_metadata_processor_service.BulkUploadDynamoRepository" ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.BulkUploadS3Repository" + ) + mocker.patch( + "services.bulk_upload_metadata_processor_service.get_virus_scan_service" + ) service = BulkUploadMetadataProcessorService( metadata_formatter_service=MockMetadataPreprocessorService( @@ -797,7 +830,6 @@ def mock_service_no_remapping(mocker): "process_metadata_row", wraps=service.process_metadata_row, ) - mocker.patch.object(service, "s3_service") return service From 4c96b31f02064dfec996668c40e452c65f956100 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Mon, 17 Nov 2025 16:28:52 +0000 Subject: [PATCH 06/11] [PRMP-541] increased test coverage and formated code --- .../bulk_upload_metadata_processor_service.py | 18 ++--- ..._bulk_upload_metadata_processor_handler.py | 18 +++++ ..._bulk_upload_metadata_processor_service.py | 78 ++++++++++++++++++- 3 files changed, 101 insertions(+), 13 deletions(-) diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index 7df591a88..d3ac9681d 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -9,15 +9,12 @@ import pydantic from botocore.exceptions import ClientError - from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from enums.upload_status import UploadStatus from models.staging_metadata import ( METADATA_FILENAME, BulkUploadQueueMetadata, MetadataFile, -) -from models.staging_metadata import ( StagingSqsMetadata, ) from repositories.bulk_upload.bulk_upload_dynamo_repository import ( @@ -35,7 +32,6 @@ from services.bulk_upload_metadata_preprocessor_service import ( MetadataPreprocessorService, ) -from services.virus_scan_result_service import VirusScanService from services.metadata_mapping_validator_service import MetadataMappingValidatorService from utils.audit_logging_setup import LoggingService from utils.exceptions import ( @@ -174,7 +170,7 @@ def csv_to_sqs_metadata(self, csv_file_path: str) -> list[StagingSqsMetadata]: ] def process_metadata_row( - self, row: dict, patients: dict[tuple[str, str], list[BulkUploadQueueMetadata]] + self, row: dict, patients: dict[tuple[str, str], list[BulkUploadQueueMetadata]] ) -> None: """Validate individual file metadata and attach to patient group.""" file_metadata = MetadataFile.model_validate(row) @@ -191,7 +187,7 @@ def process_metadata_row( @staticmethod def convert_to_sqs_metadata( - file: MetadataFile, stored_file_name: str + file: MetadataFile, stored_file_name: str ) -> BulkUploadQueueMetadata: """Convert a MetadataFile into BulkUploadQueueMetadata.""" return BulkUploadQueueMetadata( @@ -214,10 +210,10 @@ def validate_and_correct_filename(self, file_metadata: MetadataFile) -> str: ) def handle_invalid_filename( - self, - file_metadata: MetadataFile, - error: InvalidFileNameException, - nhs_number: str, + self, + file_metadata: MetadataFile, + error: InvalidFileNameException, + nhs_number: str, ) -> None: """Handle invalid filenames by logging and storing failure in Dynamo.""" logger.error( @@ -232,7 +228,7 @@ def handle_invalid_filename( ) def send_metadata_to_fifo_sqs( - self, staging_sqs_metadata_list: list[StagingSqsMetadata] + self, staging_sqs_metadata_list: list[StagingSqsMetadata] ) -> None: """Send validated metadata entries to SQS FIFO queue.""" sqs_group_id = f"bulk_upload_{uuid.uuid4()}" diff --git a/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py b/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py index b7029f5be..266163105 100644 --- a/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py +++ b/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py @@ -28,3 +28,21 @@ def test_metadata_processor_lambda_handler_empty_event( lambda_handler({}, context) mock_metadata_service.process_metadata.assert_not_called() + + +def test_metadata_processor_lambda_handler_s3_event_triggers_expedite( + set_env, context, mock_metadata_service +): + event = { + "source": "aws.s3", + "detail": { + "object": { + "key": "expedite/folder/file.pdf", + } + }, + } + + lambda_handler(event, context) + + mock_metadata_service.handle_expedite_event.assert_called_once_with(event) + mock_metadata_service.process_metadata.assert_not_called() diff --git a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py index 5ef8649f1..b13cddfc7 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py @@ -33,7 +33,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, - TagNotFoundException, + VirusScanNoResultException, ) METADATA_FILE_DIR = "tests/unit/helpers/data/bulk_upload" @@ -331,7 +331,6 @@ def bulk_upload_service(mocker, set_env, mock_tempfile): ) - def test_duplicates_csv_to_sqs_metadata(mocker, bulk_upload_service): header = "FILEPATH,PAGE COUNT,GP-PRACTICE-CODE,NHS-NO,SECTION,SUB-SECTION,SCAN-DATE,SCAN-ID,USER-ID,UPLOAD" line1 = ( @@ -903,6 +902,7 @@ def test_handle_expedite_event_raises_on_missing_key(mocker, test_service): mocked_enforce.assert_not_called() + def test_get_formatter_service_returns_general_for_general_value(): from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from services.bulk_upload.metadata_general_preprocessor import ( @@ -933,3 +933,77 @@ def test_get_formatter_service_defaults_to_general_on_invalid_value(): cls = get_formatter_service("this-is-not-valid") assert cls is MetadataGeneralPreprocessor + + +def test_enforce_virus_scanner_happy_path_does_not_trigger_scan(mocker, test_service): + file_key = "expedite/folder/file.pdf" + + mock_check = mocker.patch.object( + test_service.s3_repo, "check_file_tag_status", return_value=None + ) + mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") + + test_service.enforce_virus_scanner(file_key) + + mock_check.assert_called_once_with(file_key) + mock_scan.assert_not_called() + + +def test_enforce_virus_scanner_triggers_scan_when_no_result(mocker, test_service): + file_key = "expedite/folder/file.pdf" + + mocker.patch.object( + test_service.s3_repo, + "check_file_tag_status", + side_effect=VirusScanNoResultException("no tag"), + ) + mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") + + test_service.enforce_virus_scanner(file_key) + + mock_scan.assert_called_once_with(file_ref=file_key) + + +def test_enforce_virus_scanner_raises_bulk_exception_on_s3_access_error( + mocker, test_service +): + file_key = "expedite/folder/file.pdf" + client_error = ClientError( + {"Error": {"Code": "403", "Message": "NoSuchKey: object not found"}}, + "GetObject", + ) + + mocker.patch.object( + test_service.s3_repo, + "check_file_tag_status", + side_effect=client_error, + ) + mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") + + with pytest.raises(BulkUploadMetadataException) as excinfo: + test_service.enforce_virus_scanner(file_key) + + assert f"Failed to access S3 file {file_key} during tag check." in str( + excinfo.value + ) + mock_scan.assert_not_called() + + +def test_enforce_virus_scanner_re_raises_unexpected_client_error(mocker, test_service): + file_key = "expedite/folder/file.pdf" + client_error = ClientError( + {"Error": {"Code": "500", "Message": "InternalError"}}, + "GetObject", + ) + + mocker.patch.object( + test_service.s3_repo, + "check_file_tag_status", + side_effect=client_error, + ) + mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") + + with pytest.raises(ClientError): + test_service.enforce_virus_scanner(file_key) + + mock_scan.assert_not_called() From 00b2c40b0fff884eed5ac31b2ad087e1a6a8377e Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Thu, 20 Nov 2025 09:28:38 +0000 Subject: [PATCH 07/11] [PRMP-541] add verification that the file was scanned successfully --- .../bulk_upload_metadata_processor_service.py | 7 ++++ ..._bulk_upload_metadata_processor_service.py | 35 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index d3ac9681d..b11696bcf 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -11,6 +11,7 @@ from botocore.exceptions import ClientError from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from enums.upload_status import UploadStatus +from enums.virus_scan_result import VirusScanResult from models.staging_metadata import ( METADATA_FILENAME, BulkUploadQueueMetadata, @@ -263,6 +264,11 @@ def clear_temp_storage(self): except FileNotFoundError: pass + def check_file_status(self, file_key: str): + scan_result = self.s3_repo.check_file_tag_status(file_key) + if scan_result != VirusScanResult.CLEAN: + logger.info(f"Found an issue with the file {file_key}.") + def enforce_virus_scanner(self, file_key: str): logger.info( f"Checking virus scan result for file: {file_key} in {self.staging_bucket_name}" @@ -298,6 +304,7 @@ def handle_expedite_event(self, event): logger.info("Processing file from expedite folder") self.enforce_virus_scanner(key) + self.check_file_status(key) return # To be added upon by ticket PRMP-540 else: diff --git a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py index b13cddfc7..efd326439 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py @@ -9,6 +9,8 @@ from botocore.exceptions import ClientError from enums.upload_status import UploadStatus from freezegun import freeze_time + +from enums.virus_scan_result import VirusScanResult from models.staging_metadata import ( METADATA_FILENAME, BulkUploadQueueMetadata, @@ -1007,3 +1009,36 @@ def test_enforce_virus_scanner_re_raises_unexpected_client_error(mocker, test_se test_service.enforce_virus_scanner(file_key) mock_scan.assert_not_called() + +def test_check_file_status_clean_does_nothing(mocker, test_service, caplog): + file_key = "expedite/folder/file.pdf" + mock_check = mocker.patch.object( + test_service.s3_repo, + "check_file_tag_status", + return_value=VirusScanResult.CLEAN, + ) + + with caplog.at_level("INFO"): + test_service.check_file_status(file_key) + + mock_check.assert_called_once_with(file_key) + assert not any( + "Found an issue with the file" in record.msg for record in caplog.records + ) + + +def test_check_file_status_logs_issue_when_not_clean(mocker, test_service, caplog): + file_key = "expedite/folder/file.pdf" + mocker.patch.object( + test_service.s3_repo, + "check_file_tag_status", + return_value=VirusScanResult.INFECTED, + ) + + with caplog.at_level("INFO"): + test_service.check_file_status(file_key) + + assert any( + f"Found an issue with the file {file_key}." in record.msg + for record in caplog.records + ) From 5a706b644db4f04c1abe49f40e29313d5866ff83 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Thu, 20 Nov 2025 10:00:17 +0000 Subject: [PATCH 08/11] [PRMP-541] merged with main --- .../bulk_upload_metadata_processor_handler.py | 3 -- ..._bulk_upload_metadata_processor_handler.py | 31 +++++++++---------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/lambdas/handlers/bulk_upload_metadata_processor_handler.py b/lambdas/handlers/bulk_upload_metadata_processor_handler.py index 9265784c7..3404706ea 100644 --- a/lambdas/handlers/bulk_upload_metadata_processor_handler.py +++ b/lambdas/handlers/bulk_upload_metadata_processor_handler.py @@ -1,5 +1,3 @@ -import urllib.parse - from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from services.bulk_upload_metadata_processor_service import ( BulkUploadMetadataProcessorService, @@ -10,7 +8,6 @@ from utils.decorators.handle_lambda_exceptions import handle_lambda_exceptions from utils.decorators.override_error_check import override_error_check from utils.decorators.set_audit_arg import set_request_context_for_logging -from utils.exceptions import BulkUploadMetadataException logger = LoggingService(__name__) diff --git a/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py b/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py index 58eacbfe9..fe69f1633 100644 --- a/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py +++ b/lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py @@ -18,10 +18,10 @@ def eventbridge_event_with_s3_key(key: str): return { "source": "aws.s3", "detail": { - "object":{ - "key": key, - }, - } + "object": { + "key": key, + }, + }, } @@ -65,17 +65,17 @@ def test_s3_event_with_expedite_key_processes( event = eventbridge_event_with_s3_key( "expedite%2F1of1_Lloyd_George_Record_[John Michael SMITH]_[1234567890]_[15-05-1990].pdf" ) - lambda_handler(event, context) + + with caplog.at_level("INFO"): + lambda_handler(event, context) assert any( - f"Handling EventBridge event from S3" - in r.message - for r in caplog.records - ) - assert any( - "Processing file from expedite folder" in r.message for r in caplog.records + "Handling EventBridge event from S3" in r.message for r in caplog.records ) + mock_metadata_service.handle_expedite_event.assert_called_once_with(event) + mock_metadata_service.process_metadata.assert_not_called() + def test_s3_event_with_non_expedite_key_is_rejected( set_env, context, mock_metadata_service, caplog @@ -83,11 +83,8 @@ def test_s3_event_with_non_expedite_key_is_rejected( key_string = "uploads/1of1_Lloyd_George_Record_[John Michael SMITH]_[1234567890]_[15-05-1990].pdf" event = eventbridge_event_with_s3_key(key_string) - lambda_handler(event, context) + with caplog.at_level("INFO"): + lambda_handler(event, context) - assert any( - f"Unexpected directory or file location received from EventBridge: {key_string}" - in r.message - for r in caplog.records - ) + mock_metadata_service.handle_expedite_event.assert_called_once_with(event) mock_metadata_service.process_metadata.assert_not_called() From 197259a9d6a54e5bcbffe0c71b280e4b3213ba5c Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Fri, 21 Nov 2025 11:23:36 +0000 Subject: [PATCH 09/11] [PRMP-541] fixed comments --- .../bulk_upload/bulk_upload_s3_repository.py | 7 +++--- .../bulk_upload_metadata_processor_service.py | 18 ++++++------- ..._bulk_upload_metadata_processor_service.py | 25 +++++++++++-------- .../unit/services/test_bulk_upload_service.py | 18 ++++++++----- 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py b/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py index 02798b15e..e5caf6b04 100644 --- a/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py +++ b/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py @@ -97,7 +97,7 @@ def rollback_transaction(self): def file_exists_on_staging_bucket(self, file_key: str) -> bool: return self.s3_repository.file_exist_on_s3(self.staging_bucket_name, file_key) - def check_file_tag_status(self, file_key: str) -> str: + def check_file_tag_status_on_staging_bucket(self, file_key: str) -> str: """ Retrieves the virus scan tag value for a single file. Raises specific exceptions based on the tag's presence or S3 access. @@ -114,9 +114,8 @@ def check_file_tag_status(self, file_key: str) -> str: return raw_scan_result except TagNotFoundException: - raise VirusScanNoResultException( - f"Virus scan result not found for document: {file_key}" - ) + return "" + except ClientError as e: error_msg = str(e) if "AccessDenied" in str(e) or "NoSuchKey" in error_msg: diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index b11696bcf..091d8bf2d 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -39,7 +39,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, - VirusScanNoResultException, + VirusScanNoResultException, VirusScanFailedException, ) from utils.lloyd_george_validator import validate_file_name from utils.utilities import get_virus_scan_service @@ -265,9 +265,12 @@ def clear_temp_storage(self): pass def check_file_status(self, file_key: str): - scan_result = self.s3_repo.check_file_tag_status(file_key) + scan_result = self.s3_repo.check_file_tag_status_on_staging_bucket(file_key) if scan_result != VirusScanResult.CLEAN: logger.info(f"Found an issue with the file {file_key}.") + raise VirusScanFailedException( + f"Encountered an issue when scanning the file {file_key}, scan result was {scan_result}" + ) def enforce_virus_scanner(self, file_key: str): logger.info( @@ -275,14 +278,11 @@ def enforce_virus_scanner(self, file_key: str): ) try: - self.s3_repo.check_file_tag_status(file_key) - logger.info("The file has been scanned before") - return - - except VirusScanNoResultException: - # File has not been scanned. + result = self.s3_repo.check_file_tag_status_on_staging_bucket(file_key) + if(result != ""): + logger.info("The file has been scanned before") + return logger.info(f"Virus scan tag missing for {file_key}.") - # force scan the file self.virus_scan_service.scan_file(file_ref=file_key) except ClientError as e: diff --git a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py index efd326439..d9a68f75b 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_metadata_processor_service.py @@ -35,7 +35,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, - VirusScanNoResultException, + VirusScanNoResultException, VirusScanFailedException, ) METADATA_FILE_DIR = "tests/unit/helpers/data/bulk_upload" @@ -872,11 +872,13 @@ def test_handle_expedite_event_calls_enforce_for_expedite_key(mocker, test_servi event = {"detail": {"object": {"key": encoded_key}}} mocked_enforce = mocker.patch.object(test_service, "enforce_virus_scanner") + mocked_check_status = mocker.patch.object(test_service, "check_file_status") test_service.handle_expedite_event(event) decoded_key = "expedite/folder/some file.pdf" mocked_enforce.assert_called_once_with(decoded_key) + mocked_check_status.assert_called_once_with(decoded_key) def test_handle_expedite_event_raises_on_unexpected_directory(mocker, test_service): @@ -941,7 +943,9 @@ def test_enforce_virus_scanner_happy_path_does_not_trigger_scan(mocker, test_ser file_key = "expedite/folder/file.pdf" mock_check = mocker.patch.object( - test_service.s3_repo, "check_file_tag_status", return_value=None + test_service.s3_repo, + "check_file_tag_status_on_staging_bucket", + return_value=VirusScanResult.CLEAN, ) mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") @@ -956,8 +960,8 @@ def test_enforce_virus_scanner_triggers_scan_when_no_result(mocker, test_service mocker.patch.object( test_service.s3_repo, - "check_file_tag_status", - side_effect=VirusScanNoResultException("no tag"), + "check_file_tag_status_on_staging_bucket", + return_value="", ) mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") @@ -977,7 +981,7 @@ def test_enforce_virus_scanner_raises_bulk_exception_on_s3_access_error( mocker.patch.object( test_service.s3_repo, - "check_file_tag_status", + "check_file_tag_status_on_staging_bucket", side_effect=client_error, ) mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") @@ -1000,7 +1004,7 @@ def test_enforce_virus_scanner_re_raises_unexpected_client_error(mocker, test_se mocker.patch.object( test_service.s3_repo, - "check_file_tag_status", + "check_file_tag_status_on_staging_bucket", side_effect=client_error, ) mock_scan = mocker.patch.object(test_service.virus_scan_service, "scan_file") @@ -1014,7 +1018,7 @@ def test_check_file_status_clean_does_nothing(mocker, test_service, caplog): file_key = "expedite/folder/file.pdf" mock_check = mocker.patch.object( test_service.s3_repo, - "check_file_tag_status", + "check_file_tag_status_on_staging_bucket", return_value=VirusScanResult.CLEAN, ) @@ -1031,14 +1035,15 @@ def test_check_file_status_logs_issue_when_not_clean(mocker, test_service, caplo file_key = "expedite/folder/file.pdf" mocker.patch.object( test_service.s3_repo, - "check_file_tag_status", + "check_file_tag_status_on_staging_bucket", return_value=VirusScanResult.INFECTED, ) with caplog.at_level("INFO"): - test_service.check_file_status(file_key) + with pytest.raises(VirusScanFailedException): + test_service.check_file_status(file_key) assert any( f"Found an issue with the file {file_key}." in record.msg for record in caplog.records - ) + ) \ No newline at end of file diff --git a/lambdas/tests/unit/services/test_bulk_upload_service.py b/lambdas/tests/unit/services/test_bulk_upload_service.py index e99025caa..f2e12b267 100644 --- a/lambdas/tests/unit/services/test_bulk_upload_service.py +++ b/lambdas/tests/unit/services/test_bulk_upload_service.py @@ -1116,7 +1116,7 @@ def file_key(): def test_check_file_tag_status_returns_value_when_tag_exists(repo, file_key): repo.s3_repository.get_tag_value.return_value = "CLEAN" - result = repo.check_file_tag_status(file_key) + result = repo.check_file_tag_status_on_staging_bucket(file_key) assert result == "CLEAN" repo.s3_repository.get_tag_value.assert_called_once_with( @@ -1126,11 +1126,17 @@ def test_check_file_tag_status_returns_value_when_tag_exists(repo, file_key): ) -def test_check_file_tag_status_raises_no_result_when_tag_missing(repo, file_key): +def test_check_file_tag_status_returns_empty_string_when_tag_missing(repo, file_key): repo.s3_repository.get_tag_value.side_effect = TagNotFoundException("no tag") - with pytest.raises(VirusScanNoResultException): - repo.check_file_tag_status(file_key) + result = repo.check_file_tag_status_on_staging_bucket(file_key) + + assert result == "" + repo.s3_repository.get_tag_value.assert_called_once_with( + s3_bucket_name=MOCK_STAGING_STORE_BUCKET, + file_key=file_key, + tag_key=SCAN_RESULT_TAG_KEY, + ) @pytest.mark.parametrize("fragment", ["AccessDenied", "NoSuchKey"]) @@ -1140,7 +1146,7 @@ def test_wraps_access_errors_as_s3_not_found(repo, file_key, fragment): ) with pytest.raises(S3FileNotFoundException): - repo.check_file_tag_status(file_key) + repo.check_file_tag_status_on_staging_bucket(file_key) def test_reraises_other_client_errors(repo, file_key): @@ -1150,4 +1156,4 @@ def test_reraises_other_client_errors(repo, file_key): ) with pytest.raises(ClientError): - repo.check_file_tag_status(file_key) + repo.check_file_tag_status_on_staging_bucket(file_key) From 3955a02631ac99dfc4abe0e29f2283d492d54ac9 Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Fri, 21 Nov 2025 11:39:06 +0000 Subject: [PATCH 10/11] [PRMP-541] fixed comments --- .../bulk_upload/bulk_upload_s3_repository.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py b/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py index e5caf6b04..8c7411df8 100644 --- a/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py +++ b/lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py @@ -14,7 +14,7 @@ VirusScanNoResultException, ) -_logger = LoggingService(__name__) +logger = LoggingService(__name__) class BulkUploadS3Repository: @@ -54,17 +54,17 @@ def check_virus_result( ) except ClientError as e: if "AccessDenied" in str(e) or "NoSuchKey" in str(e): - _logger.info( + logger.info( f"Failed to check object tag for given file_path: {file_path}" ) - _logger.info( + logger.info( "file_path may be incorrect or contain invalid character" ) raise S3FileNotFoundException(f"Failed to access file {file_path}") else: raise e - _logger.info( + logger.info( f"Verified that all documents for patient {staging_metadata.nhs_number} are clean." ) @@ -119,10 +119,10 @@ def check_file_tag_status_on_staging_bucket(self, file_key: str) -> str: except ClientError as e: error_msg = str(e) if "AccessDenied" in str(e) or "NoSuchKey" in error_msg: - _logger.info( + logger.error( f"Failed to check object tag for given file_path: {file_key}" ) - _logger.info("file_path may be incorrect or contain invalid character") + logger.error("file_path may be incorrect or contain invalid character") raise S3FileNotFoundException(f"Failed to access file {file_key}") else: raise e From 0072648511e5d1f4b8ef3df786528e527a78b86b Mon Sep 17 00:00:00 2001 From: PedroSoaresNHS Date: Thu, 27 Nov 2025 08:24:40 +0000 Subject: [PATCH 11/11] [PRMP-541] fixed imports --- lambdas/services/bulk_upload_metadata_processor_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/services/bulk_upload_metadata_processor_service.py b/lambdas/services/bulk_upload_metadata_processor_service.py index 59d12cefd..b20b12d33 100644 --- a/lambdas/services/bulk_upload_metadata_processor_service.py +++ b/lambdas/services/bulk_upload_metadata_processor_service.py @@ -3,12 +3,12 @@ import shutil import tempfile import urllib.parse -import uuid from collections import defaultdict from datetime import datetime import pydantic from botocore.exceptions import ClientError + from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat from enums.upload_status import UploadStatus from enums.virus_scan_result import VirusScanResult @@ -39,7 +39,7 @@ BulkUploadMetadataException, InvalidFileNameException, LGInvalidFilesException, - VirusScanNoResultException, VirusScanFailedException, + VirusScanFailedException, ) from utils.lloyd_george_validator import validate_file_name from utils.utilities import get_virus_scan_service