
Commit 8ea9751

dlzhry2nhsmfjarvis authored and committed
VED-718 Use ms precision in filename processor and minor refactor (#741)
1 parent bc826da commit 8ea9751

File tree

7 files changed: +81 additions, -53 deletions


filenameprocessor/src/constants.py

Lines changed: 6 additions & 7 deletions
@@ -1,6 +1,8 @@
 """Constants for the filenameprocessor lambda"""

 import os
+from enum import StrEnum
+
 from errors import (
     VaccineTypePermissionsError,
     InvalidFileKeyError,
@@ -15,6 +17,8 @@
 AUDIT_TABLE_NAME = os.getenv("AUDIT_TABLE_NAME")
 AUDIT_TABLE_QUEUE_NAME_GSI = "queue_name_index"
 AUDIT_TABLE_FILENAME_GSI = "filename_index"
+DATA_SOURCES_BUCKET_SUFFIX = "data-sources"
+VALID_VERSIONS = ["V5"]

 SUPPLIER_PERMISSIONS_HASH_KEY = "supplier_permissions"
 VACCINE_TYPE_TO_DISEASES_HASH_KEY = "vacc_to_diseases"
@@ -31,7 +35,7 @@
 }


-class FileStatus:
+class FileStatus(StrEnum):
     """File status constants"""

     QUEUED = "Queued"
@@ -40,16 +44,11 @@ class FileStatus:
     DUPLICATE = "Not processed - duplicate"


-class AuditTableKeys:
+class AuditTableKeys(StrEnum):
     """Audit table keys"""

     FILENAME = "filename"
     MESSAGE_ID = "message_id"
     QUEUE_NAME = "queue_name"
     STATUS = "status"
     TIMESTAMP = "timestamp"
-
-
-class Constants:
-    """Constants for the filenameprocessor lambda"""
-    VALID_VERSIONS = ["V5"]

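Because FileStatus and AuditTableKeys now subclass StrEnum, each member is itself a str: existing string comparisons, f-string formatting, and JSON serialisation keep working unchanged while the values gain enum-style grouping. A minimal sketch of that behaviour (requires Python 3.11+, where enum.StrEnum was introduced; the class below simply mirrors two of the constants above):

```python
import json
from enum import StrEnum


class FileStatus(StrEnum):
    """File status constants (mirroring filenameprocessor/src/constants.py)"""
    QUEUED = "Queued"
    DUPLICATE = "Not processed - duplicate"


# StrEnum members compare equal to, and format as, their underlying strings
assert FileStatus.QUEUED == "Queued"
assert f"{FileStatus.QUEUED}" == "Queued"

# so they can be dropped straight into log payloads or DynamoDB items
print(json.dumps({"status": FileStatus.QUEUED}))  # {"status": "Queued"}
```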
filenameprocessor/src/file_key_validation.py

Lines changed: 9 additions & 2 deletions
@@ -2,11 +2,18 @@

 from re import match
 from datetime import datetime
-from constants import Constants
+from constants import VALID_VERSIONS
 from elasticache import get_valid_vaccine_types_from_cache, get_supplier_system_from_cache
 from errors import InvalidFileKeyError


+def is_file_in_directory_root(file_key: str) -> bool:
+    """"
+    Checks that a given file is in the bucket root rather than a child directory e.g. archive/xyz.csv
+    """
+    return "/" not in file_key
+
+
 def is_valid_datetime(timestamp: str) -> bool:
     """
     Returns a bool to indicate whether the timestamp is a valid datetime in the format 'YYYYmmddTHHMMSSzz'
@@ -53,7 +60,7 @@ def validate_file_key(file_key: str) -> tuple[str, str]:
     if not (
         vaccine_type in valid_vaccine_types
         and vaccination == "VACCINATIONS"
-        and version in Constants.VALID_VERSIONS
+        and version in VALID_VERSIONS
         and supplier  # Note that if supplier could be identified, this also implies that ODS code is valid
         and is_valid_datetime(timestamp)
         and ((extension == "CSV") or (extension == "DAT"))  # The DAT extension has been added for MESH file processing

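The new helper treats any key containing a "/" as living below the bucket root, which is exactly the condition the processor wants to skip. A small standalone illustration (the function body is copied from the diff above; the example file keys are made up for illustration only):

```python
def is_file_in_directory_root(file_key: str) -> bool:
    """Checks that a given file is in the bucket root rather than a child directory e.g. archive/xyz.csv"""
    return "/" not in file_key


# Keys in the bucket root should be processed; keys under processing/ or archive/ should be ignored
for key in [
    "FLU_Vaccinations_V5_YGM41_20240101T00000000.csv",             # root      -> True
    "processing/FLU_Vaccinations_V5_YGM41_20240101T00000000.csv",  # moved file -> False
    "archive/FLU_Vaccinations_V5_YGM41_20240101T00000000.csv",     # moved file -> False
]:
    print(key, is_file_in_directory_root(key))
```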
filenameprocessor/src/file_name_processor.py

Lines changed: 9 additions & 8 deletions
@@ -9,7 +9,7 @@
 import argparse
 from uuid import uuid4
 from utils_for_filenameprocessor import get_created_at_formatted_string, move_file, invoke_filename_lambda
-from file_key_validation import validate_file_key
+from file_key_validation import validate_file_key, is_file_in_directory_root
 from send_sqs_message import make_and_send_sqs_message
 from make_and_upload_ack_file import make_and_upload_the_ack_file
 from audit_table import upsert_audit_table, get_next_queued_file_details, ensure_file_is_not_a_duplicate
@@ -24,7 +24,7 @@
     DuplicateFileError,
     UnhandledSqsError,
 )
-from constants import FileStatus, ERROR_TYPE_TO_STATUS_CODE_MAP
+from constants import FileStatus, DATA_SOURCES_BUCKET_SUFFIX, ERROR_TYPE_TO_STATUS_CODE_MAP


 # NOTE: logging_decorator is applied to handle_record function, rather than lambda_handler, because
@@ -47,13 +47,14 @@ def handle_record(record) -> dict:
     vaccine_type = "unknown"
     supplier = "unknown"

-    if "data-sources" in bucket_name:
+    if DATA_SOURCES_BUCKET_SUFFIX in bucket_name:

-        # The lambda is unintentionally invoked when a file is moved into a different folder in the source bucket.
-        # Excluding file keys containing a "/" is a workaround to prevent the lambda from processing files that
-        # are not in the root of the source bucket.
-        if "/" in file_key:
-            message = "File skipped due to duplicate lambda invoaction"
+        # In addition to when a batch file is added to the S3 bucket root for processing, this Lambda is also invoked
+        # when the file is moved to the processing/ directory and finally the /archive directory. We want to ignore
+        # those events. Unfortunately S3 event filtering does not support triggering for root files only. See VED-781
+        # for more info.
+        if not is_file_in_directory_root(file_key):
+            message = "Processing not required. Event was for a file moved to /archive or /processing"
             return {"statusCode": 200, "message": message, "file_key": file_key}

     # Set default values for file-specific variables

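In handle_record the guard sits inside the data-sources branch and short-circuits the duplicate S3 events raised when a file is moved into processing/ or archive/. The sketch below is a hypothetical, trimmed-down view of that flow, not the real handler: the bucket/key extraction follows the standard S3 event notification shape, and everything after the guard is elided.

```python
from constants import DATA_SOURCES_BUCKET_SUFFIX            # "data-sources"
from file_key_validation import is_file_in_directory_root


def handle_record(record) -> dict:
    """Hypothetical, simplified sketch of the guard shown in the diff above."""
    bucket_name = record["s3"]["bucket"]["name"]
    file_key = record["s3"]["object"]["key"]

    if DATA_SOURCES_BUCKET_SUFFIX in bucket_name:
        # Ignore the extra invocations fired when the file is moved out of the bucket root
        if not is_file_in_directory_root(file_key):
            message = "Processing not required. Event was for a file moved to /archive or /processing"
            return {"statusCode": 200, "message": message, "file_key": file_key}

    # ... validation, audit-table upsert and SQS send happen here in the real handler ...
    return {"statusCode": 200, "file_key": file_key}
```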
filenameprocessor/src/logging_decorator.py

Lines changed: 13 additions & 4 deletions
@@ -21,11 +21,19 @@ def send_log_to_firehose(log_data: dict) -> None:


 def generate_and_send_logs(
-    start_time, base_log_data: dict, additional_log_data: dict, is_error_log: bool = False
+    start_time: float,
+    base_log_data: dict,
+    additional_log_data: dict,
+    use_ms_precision: bool = False,
+    is_error_log: bool = False
 ) -> None:
     """Generates log data which includes the base_log_data, additional_log_data, and time taken (calculated using the
     current time and given start_time) and sends them to Cloudwatch and Firehose."""
-    log_data = {**base_log_data, "time_taken": f"{round(time.time() - start_time, 5)}s", **additional_log_data}
+    seconds_elapsed = time.time() - start_time
+    formatted_time_elapsed = f"{round(seconds_elapsed * 1000, 5)}ms" if use_ms_precision else \
+        f"{round(seconds_elapsed, 5)}s"
+
+    log_data = {**base_log_data, "time_taken": formatted_time_elapsed, **additional_log_data}
     log_function = logger.error if is_error_log else logger.info
     log_function(json.dumps(log_data))
     send_log_to_firehose(log_data)
@@ -47,12 +55,13 @@ def wrapper(*args, **kwargs):

         try:
             result = func(*args, **kwargs)
-            generate_and_send_logs(start_time, base_log_data, additional_log_data=result)
+            generate_and_send_logs(start_time, base_log_data, additional_log_data=result, use_ms_precision=True)
             return result

         except Exception as e:
             additional_log_data = {"statusCode": 500, "error": str(e)}
-            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=True)
+            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=True,
+                                   use_ms_precision=True)
             raise

     return wrapper

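With use_ms_precision=True the elapsed seconds are multiplied by 1000 and rounded to five decimal places before the "ms" suffix is added; the original seconds path is unchanged. A minimal sketch of just that formatting rule (the helper name is illustrative), using the same values the tests below assert on:

```python
def format_time_elapsed(seconds_elapsed: float, use_ms_precision: bool = False) -> str:
    """Mirrors the time_taken formatting in generate_and_send_logs."""
    if use_ms_precision:
        return f"{round(seconds_elapsed * 1000, 5)}ms"
    return f"{round(seconds_elapsed, 5)}s"


print(format_time_elapsed(0.123456))                         # 0.12346s
print(format_time_elapsed(0.123456, use_ms_precision=True))  # 123.456ms
print(format_time_elapsed(1.0, use_ms_precision=True))       # 1000.0ms
```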
filenameprocessor/tests/test_file_key_validation.py

Lines changed: 15 additions & 3 deletions
@@ -9,7 +9,7 @@

 # Ensure environment variables are mocked before importing from src files
 with patch.dict("os.environ", MOCK_ENVIRONMENT_DICT):
-    from file_key_validation import is_valid_datetime, validate_file_key
+    from file_key_validation import is_file_in_directory_root, is_valid_datetime, validate_file_key
     from errors import InvalidFileKeyError

 VALID_FLU_EMIS_FILE_KEY = MockFileDetails.emis_flu.file_key
@@ -18,9 +18,21 @@

 class TestFileKeyValidation(TestCase):
     """Tests for file_key_validation functions"""
+    def test_is_file_in_directory_root(self):
+        test_cases = [
+            ("test_file.csv", True),
+            ("archive/test_file.csv", False),
+            ("processing/test_file.csv", False),
+            ("lots/of/directories/init.py", False),
+        ]
+
+        for test_file_key, expected in test_cases:
+            with self.subTest():
+                self.assertEqual(is_file_in_directory_root(test_file_key), expected)
+
     def test_is_valid_datetime(self):
-        "Tests that is_valid_datetime returns True for valid datetimes, and false otherwise"
-        # Test case tuples are stuctured as (date_time_string, expected_result)
+        """Tests that is_valid_datetime returns True for valid datetimes, and false otherwise"""
+        # Test case tuples are structured as (date_time_string, expected_result)
         test_cases = [
             ("20200101T12345600", True),  # Valid datetime string with timezone
             ("20200101T123456", True),  # Valid datetime string without timezone

filenameprocessor/tests/test_logging_decorator.py

Lines changed: 28 additions & 28 deletions
@@ -110,33 +110,33 @@ def test_generate_and_send_logs(self):
         additional_log_data = {"additional_key": "additional_value"}
         start_time = 1672531200

-        # CASE: Successful log - is_error_log arg set to False
-        with (  # noqa: E999
-            patch("logging_decorator.logger") as mock_logger,  # noqa: E999
-            patch("logging_decorator.send_log_to_firehose") as mock_send_log_to_firehose,  # noqa: E999
-            patch("logging_decorator.time") as mock_time,  # noqa: E999
-        ):  # noqa: E999
-            mock_time.time.return_value = 1672531200.123456  # Mocks the end time to be 0.123456s after the start time
-            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=False)
-
-            expected_log_data = {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}
-            log_data = json.loads(mock_logger.info.call_args[0][0])
-            self.assertEqual(log_data, expected_log_data)
-            mock_send_log_to_firehose.assert_called_once_with(expected_log_data)
-
-        # CASE: Error log - is_error_log arg set to True
-        with (  # noqa: E999
-            patch("logging_decorator.logger") as mock_logger,  # noqa: E999
-            patch("logging_decorator.send_log_to_firehose") as mock_send_log_to_firehose,  # noqa: E999
-            patch("logging_decorator.time") as mock_time,  # noqa: E999
-        ):  # noqa: E999
-            mock_time.time.return_value = 1672531200.123456  # Mocks the end time to be 0.123456s after the start time
-            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=True)
+        test_cases = [
+            ("Using standard log and seconds precision", False, False,
+             {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}),
+            ("Using error log and seconds precision", True, False,
+             {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}),
+            ("Using standard log and milliseconds precision", False, True,
+             {"base_key": "base_value", "time_taken": "123.456ms", "additional_key": "additional_value"})
+        ]

-            expected_log_data = {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}
-            log_data = json.loads(mock_logger.error.call_args[0][0])
-            self.assertEqual(log_data, expected_log_data)
-            mock_send_log_to_firehose.assert_called_once_with(expected_log_data)
+        for test_desc, use_error_log, use_ms_precision, expected_log_data in test_cases:
+            with self.subTest(test_desc):
+                with (  # noqa: E999
+                    patch("logging_decorator.logger") as mock_logger,  # noqa: E999
+                    patch("logging_decorator.send_log_to_firehose") as mock_send_log_to_firehose,  # noqa: E999
+                    patch("logging_decorator.time") as mock_time,  # noqa: E999
+                ):  # noqa: E999
+                    mock_time.time.return_value = 1672531200.123456  # Mocks end time to be 0.123456s after start
+                    generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=use_error_log,
+                                           use_ms_precision=use_ms_precision)
+
+                    if use_error_log:
+                        log_data = json.loads(mock_logger.error.call_args[0][0])
+                    else:
+                        log_data = json.loads(mock_logger.info.call_args[0][0])
+
+                    self.assertEqual(log_data, expected_log_data)
+                    mock_send_log_to_firehose.assert_called_once_with(expected_log_data)

     def test_logging_successful_validation(self):
         """Tests that the correct logs are sent to cloudwatch and splunk when file validation is successful"""
@@ -156,7 +156,7 @@ def test_logging_successful_validation(self):
         expected_log_data = {
             "function_name": "filename_processor_handle_record",
             "date_time": fixed_datetime.strftime("%Y-%m-%d %H:%M:%S"),
-            "time_taken": "1.0s",
+            "time_taken": "1000.0ms",
             "statusCode": 200,
             "message": "Successfully sent to SQS for further processing",
             "file_key": FILE_DETAILS.file_key,
@@ -188,7 +188,7 @@ def test_logging_failed_validation(self):
         expected_log_data = {
             "function_name": "filename_processor_handle_record",
             "date_time": fixed_datetime.strftime("%Y-%m-%d %H:%M:%S"),
-            "time_taken": "1.0s",
+            "time_taken": "1000.0ms",
             "statusCode": 403,
             "message": "Infrastructure Level Response Value - Processing Error",
             "file_key": FILE_DETAILS.file_key,

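The assertions recover the serialised log line from the mocked logger via call_args: call_args[0] is the tuple of positional arguments from the most recent call, so call_args[0][0] is the JSON string handed to logger.info or logger.error. A tiny illustration of that pattern, independent of the code under test:

```python
import json
from unittest.mock import MagicMock

mock_logger = MagicMock()
mock_logger.info(json.dumps({"time_taken": "123.456ms"}))

# call_args[0] is the positional-argument tuple of the last call, so [0][0] is the logged JSON string
logged_json = mock_logger.info.call_args[0][0]
assert json.loads(logged_json) == {"time_taken": "123.456ms"}
```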
sonar-project.properties

Lines changed: 1 addition & 1 deletion
@@ -9,4 +9,4 @@ sonar.issue.ignore.multicriteria=exclude_snomed_urls,exclude_hl7_urls
 sonar.issue.ignore.multicriteria.exclude_snomed_urls.ruleKey=python:S5332
 sonar.issue.ignore.multicriteria.exclude_snomed_urls.resourceKey=**http://snomed\.info/sct**
 sonar.issue.ignore.multicriteria.exclude_hl7_urls.ruleKey=python:S5332
-sonar.issue.ignore.multicriteria.exclude_hl7_urls.resourceKey=**http://terminology\.hl7\.org/CodeSystem/v3-NullFlavor**
+sonar.issue.ignore.multicriteria.exclude_hl7_urls.resourceKey=**http://terminology\.hl7\.org/CodeSystem/v3-NullFlavor**

(The removed and re-added lines are textually identical; the change appears to be limited to whitespace, most likely the addition of a trailing newline at the end of the file.)
