Skip to content

Commit f806f20

Browse files
[PRMP-541] Enforce virus scans on expedite files (#886)
1 parent 70b064c commit f806f20

File tree

6 files changed

+450
-82
lines changed

6 files changed

+450
-82
lines changed
Lines changed: 15 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,13 @@
1-
import urllib.parse
2-
31
from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat
4-
from services.bulk_upload.metadata_general_preprocessor import (
5-
MetadataGeneralPreprocessor,
6-
)
7-
from services.bulk_upload.metadata_usb_preprocessor import (
8-
MetadataUsbPreprocessorService,
9-
)
102
from services.bulk_upload_metadata_processor_service import (
113
BulkUploadMetadataProcessorService,
4+
get_formatter_service,
125
)
136
from utils.audit_logging_setup import LoggingService
147
from utils.decorators.ensure_env_var import ensure_environment_variables
158
from utils.decorators.handle_lambda_exceptions import handle_lambda_exceptions
169
from utils.decorators.override_error_check import override_error_check
1710
from utils.decorators.set_audit_arg import set_request_context_for_logging
18-
from utils.exceptions import BulkUploadMetadataException
1911

2012
logger = LoggingService(__name__)
2113

@@ -27,64 +19,33 @@
2719
)
2820
@handle_lambda_exceptions
2921
def lambda_handler(event, _context):
30-
if "source" in event and event.get("source") == "aws.s3":
31-
logger.info("Handling EventBridge event from S3")
32-
handle_expedite_event(event)
33-
return
34-
35-
practice_directory = event.get("practiceDirectory", "")
3622
raw_pre_format_type = event.get(
3723
"preFormatType", LloydGeorgePreProcessFormat.GENERAL
3824
)
3925
formatter_service_class = get_formatter_service(raw_pre_format_type)
40-
if not practice_directory:
41-
logger.error(
42-
"Failed to start metadata processing due to missing practice directory"
43-
)
44-
return
45-
46-
logger.info(
47-
f"Starting metadata processing for practice directory: {practice_directory}"
48-
)
26+
practice_directory = event.get("practiceDirectory", "")
4927

5028
remappings = event.get("metadataFieldRemappings", {})
51-
5229
metadata_formatter_service = formatter_service_class(practice_directory)
5330
metadata_service = BulkUploadMetadataProcessorService(
5431
metadata_formatter_service=metadata_formatter_service,
5532
metadata_heading_remap=remappings,
5633
)
57-
metadata_service.process_metadata()
5834

35+
if "source" in event and event.get("source") == "aws.s3":
36+
logger.info("Handling EventBridge event from S3")
37+
38+
metadata_service.handle_expedite_event(event)
39+
return
5940

60-
def get_formatter_service(raw_pre_format_type):
61-
try:
62-
pre_format_type = LloydGeorgePreProcessFormat(raw_pre_format_type)
63-
if pre_format_type == LloydGeorgePreProcessFormat.GENERAL:
64-
logger.info("Using general preFormatType")
65-
return MetadataGeneralPreprocessor
66-
elif pre_format_type == LloydGeorgePreProcessFormat.USB:
67-
logger.info("Using usb preFormatType")
68-
return MetadataUsbPreprocessorService
69-
except ValueError:
70-
logger.warning(
71-
f"Invalid preFormatType: '{raw_pre_format_type}', defaulting to {LloydGeorgePreProcessFormat.GENERAL}."
41+
if not practice_directory:
42+
logger.error(
43+
"Failed to start metadata processing due to missing practice directory"
7244
)
73-
return MetadataGeneralPreprocessor
45+
return
7446

47+
logger.info(
48+
f"Starting metadata processing for practice directory: {practice_directory}"
49+
)
7550

76-
def handle_expedite_event(event):
77-
try:
78-
key_string = event["detail"]["object"]["key"]
79-
key = urllib.parse.unquote_plus(key_string, encoding="utf-8")
80-
if key.startswith("expedite/"):
81-
logger.info("Processing file from expedite folder")
82-
return # To be added upon by ticket PRMP-540
83-
else:
84-
failure_msg = f"Unexpected directory or file location received from EventBridge: {key_string}"
85-
logger.error(failure_msg)
86-
raise BulkUploadMetadataException(failure_msg)
87-
except KeyError as e:
88-
failure_msg = f"Failed due to missing key: {str(e)}"
89-
logger.error(failure_msg)
90-
raise BulkUploadMetadataException(failure_msg)
51+
metadata_service.process_metadata()

lambdas/repositories/bulk_upload/bulk_upload_s3_repository.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
VirusScanNoResultException,
1515
)
1616

17-
_logger = LoggingService(__name__)
17+
logger = LoggingService(__name__)
1818

1919

2020
class BulkUploadS3Repository:
@@ -54,17 +54,17 @@ def check_virus_result(
5454
)
5555
except ClientError as e:
5656
if "AccessDenied" in str(e) or "NoSuchKey" in str(e):
57-
_logger.info(
57+
logger.info(
5858
f"Failed to check object tag for given file_path: {file_path}"
5959
)
60-
_logger.info(
60+
logger.info(
6161
"file_path may be incorrect or contain invalid character"
6262
)
6363
raise S3FileNotFoundException(f"Failed to access file {file_path}")
6464
else:
6565
raise e
6666

67-
_logger.info(
67+
logger.info(
6868
f"Verified that all documents for patient {staging_metadata.nhs_number} are clean."
6969
)
7070

@@ -96,3 +96,33 @@ def rollback_transaction(self):
9696

9797
def file_exists_on_staging_bucket(self, file_key: str) -> bool:
9898
return self.s3_repository.file_exist_on_s3(self.staging_bucket_name, file_key)
99+
100+
def check_file_tag_status_on_staging_bucket(self, file_key: str) -> str:
101+
"""
102+
Retrieves the virus scan tag value for a single file.
103+
Raises specific exceptions based on the tag's presence or S3 access.
104+
"""
105+
s3_service = self.s3_repository
106+
107+
try:
108+
# Call the underlying S3 method to get the tag value
109+
raw_scan_result = s3_service.get_tag_value(
110+
s3_bucket_name=self.staging_bucket_name,
111+
file_key=file_key,
112+
tag_key=SCAN_RESULT_TAG_KEY,
113+
)
114+
return raw_scan_result
115+
116+
except TagNotFoundException:
117+
return ""
118+
119+
except ClientError as e:
120+
error_msg = str(e)
121+
if "AccessDenied" in error_msg or "NoSuchKey" in error_msg:
122+
logger.error(
123+
f"Failed to check object tag for given file_path: {file_key}"
124+
)
125+
logger.error("file_path may be incorrect or contain invalid character")
126+
raise S3FileNotFoundException(f"Failed to access file {file_key}")
127+
else:
128+
raise e

lambdas/services/bulk_upload_metadata_processor_service.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
import os
33
import shutil
44
import tempfile
5+
import urllib.parse
56
from collections import defaultdict
67
from datetime import datetime
78

89
import pydantic
910
from botocore.exceptions import ClientError
11+
12+
from enums.lloyd_george_pre_process_format import LloydGeorgePreProcessFormat
1013
from enums.upload_status import UploadStatus
14+
from enums.virus_scan_result import VirusScanResult
1115
from models.staging_metadata import (
1216
METADATA_FILENAME,
1317
BulkUploadQueueMetadata,
@@ -17,8 +21,15 @@
1721
from repositories.bulk_upload.bulk_upload_dynamo_repository import (
1822
BulkUploadDynamoRepository,
1923
)
24+
from repositories.bulk_upload.bulk_upload_s3_repository import BulkUploadS3Repository
2025
from services.base.s3_service import S3Service
2126
from services.base.sqs_service import SQSService
27+
from services.bulk_upload.metadata_general_preprocessor import (
28+
MetadataGeneralPreprocessor,
29+
)
30+
from services.bulk_upload.metadata_usb_preprocessor import (
31+
MetadataUsbPreprocessorService,
32+
)
2233
from services.bulk_upload_metadata_preprocessor_service import (
2334
MetadataPreprocessorService,
2435
)
@@ -28,8 +39,10 @@
2839
BulkUploadMetadataException,
2940
InvalidFileNameException,
3041
LGInvalidFilesException,
42+
VirusScanFailedException,
3143
)
3244
from utils.lloyd_george_validator import validate_file_name
45+
from utils.utilities import get_virus_scan_service
3346

3447
logger = LoggingService(__name__)
3548
UNSUCCESSFUL = "Unsuccessful bulk upload"
@@ -47,6 +60,9 @@ def __init__(
4760
self.s3_service = S3Service()
4861
self.sqs_service = SQSService()
4962
self.dynamo_repository = BulkUploadDynamoRepository()
63+
self.s3_repo = BulkUploadS3Repository()
64+
self.virus_scan_service = get_virus_scan_service()
65+
5066
self.metadata_heading_remap = metadata_heading_remap
5167

5268
self.temp_download_dir = tempfile.mkdtemp()
@@ -245,3 +261,71 @@ def clear_temp_storage(self):
245261
shutil.rmtree(self.temp_download_dir)
246262
except FileNotFoundError:
247263
pass
264+
265+
def check_file_status(self, file_key: str):
266+
scan_result = self.s3_repo.check_file_tag_status_on_staging_bucket(file_key)
267+
if scan_result != VirusScanResult.CLEAN:
268+
logger.info(f"Found an issue with the file {file_key}.")
269+
raise VirusScanFailedException(
270+
f"Encountered an issue when scanning the file {file_key}, scan result was {scan_result}"
271+
)
272+
273+
def enforce_virus_scanner(self, file_key: str):
274+
logger.info(
275+
f"Checking virus scan result for file: {file_key} in {self.staging_bucket_name}"
276+
)
277+
278+
try:
279+
result = self.s3_repo.check_file_tag_status_on_staging_bucket(file_key)
280+
if result != "":
281+
logger.info("The file has been scanned before")
282+
return
283+
logger.info(f"Virus scan tag missing for {file_key}.")
284+
self.virus_scan_service.scan_file(file_ref=file_key)
285+
286+
except ClientError as e:
287+
error_message = str(e)
288+
if "NoSuchKey" in error_message or "AccessDenied" in error_message:
289+
logger.error(f"S3 access error when checking tag for {file_key}.")
290+
raise BulkUploadMetadataException(
291+
f"Failed to access S3 file {file_key} during tag check."
292+
)
293+
else:
294+
raise
295+
296+
def handle_expedite_event(self, event):
297+
try:
298+
key_string = event["detail"]["object"]["key"]
299+
key = urllib.parse.unquote_plus(key_string, encoding="utf-8")
300+
301+
if key.startswith("expedite/"):
302+
logger.info("Processing file from expedite folder")
303+
304+
self.enforce_virus_scanner(key)
305+
self.check_file_status(key)
306+
307+
return # To be added upon by ticket PRMP-540
308+
else:
309+
failure_msg = f"Unexpected directory or file location received from EventBridge: {key_string}"
310+
logger.error(failure_msg)
311+
raise BulkUploadMetadataException(failure_msg)
312+
except KeyError as e:
313+
failure_msg = f"Failed due to missing key: {str(e)}"
314+
logger.error(failure_msg)
315+
raise BulkUploadMetadataException(failure_msg)
316+
317+
318+
def get_formatter_service(raw_pre_format_type):
319+
try:
320+
pre_format_type = LloydGeorgePreProcessFormat(raw_pre_format_type)
321+
if pre_format_type == LloydGeorgePreProcessFormat.GENERAL:
322+
logger.info("Using general preFormatType")
323+
return MetadataGeneralPreprocessor
324+
elif pre_format_type == LloydGeorgePreProcessFormat.USB:
325+
logger.info("Using usb preFormatType")
326+
return MetadataUsbPreprocessorService
327+
except ValueError:
328+
logger.warning(
329+
f"Invalid preFormatType: '{raw_pre_format_type}', defaulting to {LloydGeorgePreProcessFormat.GENERAL}."
330+
)
331+
return MetadataGeneralPreprocessor

lambdas/tests/unit/handlers/test_bulk_upload_metadata_processor_handler.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ def eventbridge_event_with_s3_key(key: str):
1818
return {
1919
"source": "aws.s3",
2020
"detail": {
21-
"object":{
22-
"key": key,
23-
},
24-
}
21+
"object": {
22+
"key": key,
23+
},
24+
},
2525
}
2626

2727

@@ -41,35 +41,50 @@ def test_metadata_processor_lambda_handler_empty_event(
4141
mock_metadata_service.process_metadata.assert_not_called()
4242

4343

44+
def test_metadata_processor_lambda_handler_s3_event_triggers_expedite(
45+
set_env, context, mock_metadata_service
46+
):
47+
event = {
48+
"source": "aws.s3",
49+
"detail": {
50+
"object": {
51+
"key": "expedite/folder/file.pdf",
52+
}
53+
},
54+
}
55+
56+
lambda_handler(event, context)
57+
58+
mock_metadata_service.handle_expedite_event.assert_called_once_with(event)
59+
mock_metadata_service.process_metadata.assert_not_called()
60+
61+
4462
def test_s3_event_with_expedite_key_processes(
4563
set_env, context, mock_metadata_service, caplog
4664
):
4765
event = eventbridge_event_with_s3_key(
4866
"expedite%2F1of1_Lloyd_George_Record_[John Michael SMITH]_[1234567890]_[15-05-1990].pdf"
4967
)
50-
lambda_handler(event, context)
68+
69+
with caplog.at_level("INFO"):
70+
lambda_handler(event, context)
5171

5272
assert any(
53-
f"Handling EventBridge event from S3"
54-
in r.message
55-
for r in caplog.records
56-
)
57-
assert any(
58-
"Processing file from expedite folder" in r.message for r in caplog.records
73+
"Handling EventBridge event from S3" in r.message for r in caplog.records
5974
)
6075

76+
mock_metadata_service.handle_expedite_event.assert_called_once_with(event)
77+
mock_metadata_service.process_metadata.assert_not_called()
78+
6179

6280
def test_s3_event_with_non_expedite_key_is_rejected(
6381
set_env, context, mock_metadata_service, caplog
6482
):
6583
key_string = "uploads/1of1_Lloyd_George_Record_[John Michael SMITH]_[1234567890]_[15-05-1990].pdf"
6684
event = eventbridge_event_with_s3_key(key_string)
6785

68-
lambda_handler(event, context)
86+
with caplog.at_level("INFO"):
87+
lambda_handler(event, context)
6988

70-
assert any(
71-
f"Unexpected directory or file location received from EventBridge: {key_string}"
72-
in r.message
73-
for r in caplog.records
74-
)
89+
mock_metadata_service.handle_expedite_event.assert_called_once_with(event)
7590
mock_metadata_service.process_metadata.assert_not_called()

0 commit comments

Comments
 (0)