
Commit 8ea9751

dlzhry2nhsmfjarvis authored and committed
VED-718 Use ms precision in filename processor and minor refactor (#741)
1 parent bc826da commit 8ea9751

File tree

7 files changed: +81 additions, -53 deletions


filenameprocessor/src/constants.py

Lines changed: 6 additions & 7 deletions
@@ -1,6 +1,8 @@
 """Constants for the filenameprocessor lambda"""

 import os
+from enum import StrEnum
+
 from errors import (
     VaccineTypePermissionsError,
     InvalidFileKeyError,
@@ -15,6 +17,8 @@
 AUDIT_TABLE_NAME = os.getenv("AUDIT_TABLE_NAME")
 AUDIT_TABLE_QUEUE_NAME_GSI = "queue_name_index"
 AUDIT_TABLE_FILENAME_GSI = "filename_index"
+DATA_SOURCES_BUCKET_SUFFIX = "data-sources"
+VALID_VERSIONS = ["V5"]

 SUPPLIER_PERMISSIONS_HASH_KEY = "supplier_permissions"
 VACCINE_TYPE_TO_DISEASES_HASH_KEY = "vacc_to_diseases"
@@ -31,7 +35,7 @@
 }


-class FileStatus:
+class FileStatus(StrEnum):
     """File status constants"""

     QUEUED = "Queued"
@@ -40,16 +44,11 @@ class FileStatus:
     DUPLICATE = "Not processed - duplicate"


-class AuditTableKeys:
+class AuditTableKeys(StrEnum):
     """Audit table keys"""

     FILENAME = "filename"
     MESSAGE_ID = "message_id"
     QUEUE_NAME = "queue_name"
     STATUS = "status"
     TIMESTAMP = "timestamp"
-
-
-class Constants:
-    """Constants for the filenameprocessor lambda"""
-    VALID_VERSIONS = ["V5"]

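Because FileStatus and AuditTableKeys now subclass StrEnum, each member is itself a str: existing string comparisons, f-string formatting, and JSON serialisation keep working unchanged while the values gain enum-style grouping. A minimal sketch of that behaviour (requires Python 3.11+, where enum.StrEnum was introduced; the class below simply mirrors two of the constants above):

```python
import json
from enum import StrEnum


class FileStatus(StrEnum):
    """File status constants (mirroring filenameprocessor/src/constants.py)"""
    QUEUED = "Queued"
    DUPLICATE = "Not processed - duplicate"


# StrEnum members compare equal to, and format as, their underlying strings
assert FileStatus.QUEUED == "Queued"
assert f"{FileStatus.QUEUED}" == "Queued"

# so they can be dropped straight into log payloads or DynamoDB items
print(json.dumps({"status": FileStatus.QUEUED}))  # {"status": "Queued"}
```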
filenameprocessor/src/file_key_validation.py

Lines changed: 9 additions & 2 deletions
@@ -2,11 +2,18 @@

 from re import match
 from datetime import datetime
-from constants import Constants
+from constants import VALID_VERSIONS
 from elasticache import get_valid_vaccine_types_from_cache, get_supplier_system_from_cache
 from errors import InvalidFileKeyError


+def is_file_in_directory_root(file_key: str) -> bool:
+    """"
+    Checks that a given file is in the bucket root rather than a child directory e.g. archive/xyz.csv
+    """
+    return "/" not in file_key
+
+
 def is_valid_datetime(timestamp: str) -> bool:
     """
     Returns a bool to indicate whether the timestamp is a valid datetime in the format 'YYYYmmddTHHMMSSzz'
@@ -53,7 +60,7 @@ def validate_file_key(file_key: str) -> tuple[str, str]:
     if not (
         vaccine_type in valid_vaccine_types
         and vaccination == "VACCINATIONS"
-        and version in Constants.VALID_VERSIONS
+        and version in VALID_VERSIONS
         and supplier  # Note that if supplier could be identified, this also implies that ODS code is valid
         and is_valid_datetime(timestamp)
         and ((extension == "CSV") or (extension == "DAT"))  # The DAT extension has been added for MESH file processing

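The new helper treats any key containing a "/" as living below the bucket root, which is exactly the condition the processor wants to skip. A small standalone illustration (the function body is copied from the diff above; the example file keys are made up for illustration only):

```python
def is_file_in_directory_root(file_key: str) -> bool:
    """Checks that a given file is in the bucket root rather than a child directory e.g. archive/xyz.csv"""
    return "/" not in file_key


# Keys in the bucket root should be processed; keys under processing/ or archive/ should be ignored
for key in [
    "FLU_Vaccinations_V5_YGM41_20240101T00000000.csv",             # root      -> True
    "processing/FLU_Vaccinations_V5_YGM41_20240101T00000000.csv",  # moved file -> False
    "archive/FLU_Vaccinations_V5_YGM41_20240101T00000000.csv",     # moved file -> False
]:
    print(key, is_file_in_directory_root(key))
```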
filenameprocessor/src/file_name_processor.py

Lines changed: 9 additions & 8 deletions
@@ -9,7 +9,7 @@
 import argparse
 from uuid import uuid4
 from utils_for_filenameprocessor import get_created_at_formatted_string, move_file, invoke_filename_lambda
-from file_key_validation import validate_file_key
+from file_key_validation import validate_file_key, is_file_in_directory_root
 from send_sqs_message import make_and_send_sqs_message
 from make_and_upload_ack_file import make_and_upload_the_ack_file
 from audit_table import upsert_audit_table, get_next_queued_file_details, ensure_file_is_not_a_duplicate
@@ -24,7 +24,7 @@
     DuplicateFileError,
     UnhandledSqsError,
 )
-from constants import FileStatus, ERROR_TYPE_TO_STATUS_CODE_MAP
+from constants import FileStatus, DATA_SOURCES_BUCKET_SUFFIX, ERROR_TYPE_TO_STATUS_CODE_MAP


 # NOTE: logging_decorator is applied to handle_record function, rather than lambda_handler, because
@@ -47,13 +47,14 @@ def handle_record(record) -> dict:
     vaccine_type = "unknown"
     supplier = "unknown"

-    if "data-sources" in bucket_name:
+    if DATA_SOURCES_BUCKET_SUFFIX in bucket_name:

-        # The lambda is unintentionally invoked when a file is moved into a different folder in the source bucket.
-        # Excluding file keys containing a "/" is a workaround to prevent the lambda from processing files that
-        # are not in the root of the source bucket.
-        if "/" in file_key:
-            message = "File skipped due to duplicate lambda invoaction"
+        # In addition to when a batch file is added to the S3 bucket root for processing, this Lambda is also invoked
+        # when the file is moved to the processing/ directory and finally the /archive directory. We want to ignore
+        # those events. Unfortunately S3 event filtering does not support triggering for root files only. See VED-781
+        # for more info.
+        if not is_file_in_directory_root(file_key):
+            message = "Processing not required. Event was for a file moved to /archive or /processing"
             return {"statusCode": 200, "message": message, "file_key": file_key}

     # Set default values for file-specific variables

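In handle_record the guard sits inside the data-sources branch and short-circuits the duplicate S3 events raised when a file is moved into processing/ or archive/. The sketch below is a hypothetical, trimmed-down view of that flow, not the real handler: the bucket/key extraction follows the standard S3 event notification shape, and everything after the guard is elided.

```python
from constants import DATA_SOURCES_BUCKET_SUFFIX            # "data-sources"
from file_key_validation import is_file_in_directory_root


def handle_record(record) -> dict:
    """Hypothetical, simplified sketch of the guard shown in the diff above."""
    bucket_name = record["s3"]["bucket"]["name"]
    file_key = record["s3"]["object"]["key"]

    if DATA_SOURCES_BUCKET_SUFFIX in bucket_name:
        # Ignore the extra invocations fired when the file is moved out of the bucket root
        if not is_file_in_directory_root(file_key):
            message = "Processing not required. Event was for a file moved to /archive or /processing"
            return {"statusCode": 200, "message": message, "file_key": file_key}

    # ... validation, audit-table upsert and SQS send happen here in the real handler ...
    return {"statusCode": 200, "file_key": file_key}
```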
filenameprocessor/src/logging_decorator.py

Lines changed: 13 additions & 4 deletions
@@ -21,11 +21,19 @@ def send_log_to_firehose(log_data: dict) -> None:


 def generate_and_send_logs(
-    start_time, base_log_data: dict, additional_log_data: dict, is_error_log: bool = False
+    start_time: float,
+    base_log_data: dict,
+    additional_log_data: dict,
+    use_ms_precision: bool = False,
+    is_error_log: bool = False
 ) -> None:
     """Generates log data which includes the base_log_data, additional_log_data, and time taken (calculated using the
     current time and given start_time) and sends them to Cloudwatch and Firehose."""
-    log_data = {**base_log_data, "time_taken": f"{round(time.time() - start_time, 5)}s", **additional_log_data}
+    seconds_elapsed = time.time() - start_time
+    formatted_time_elapsed = f"{round(seconds_elapsed * 1000, 5)}ms" if use_ms_precision else \
+        f"{round(seconds_elapsed, 5)}s"
+
+    log_data = {**base_log_data, "time_taken": formatted_time_elapsed, **additional_log_data}
     log_function = logger.error if is_error_log else logger.info
     log_function(json.dumps(log_data))
     send_log_to_firehose(log_data)
@@ -47,12 +55,13 @@ def wrapper(*args, **kwargs):

         try:
             result = func(*args, **kwargs)
-            generate_and_send_logs(start_time, base_log_data, additional_log_data=result)
+            generate_and_send_logs(start_time, base_log_data, additional_log_data=result, use_ms_precision=True)
             return result

         except Exception as e:
             additional_log_data = {"statusCode": 500, "error": str(e)}
-            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=True)
+            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=True,
+                                   use_ms_precision=True)
             raise

     return wrapper

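With use_ms_precision=True the elapsed seconds are multiplied by 1000 and rounded to five decimal places before the "ms" suffix is added; the original seconds path is unchanged. A minimal sketch of just that formatting rule (the helper name is illustrative), using the same values the tests below assert on:

```python
def format_time_elapsed(seconds_elapsed: float, use_ms_precision: bool = False) -> str:
    """Mirrors the time_taken formatting in generate_and_send_logs."""
    if use_ms_precision:
        return f"{round(seconds_elapsed * 1000, 5)}ms"
    return f"{round(seconds_elapsed, 5)}s"


print(format_time_elapsed(0.123456))                         # 0.12346s
print(format_time_elapsed(0.123456, use_ms_precision=True))  # 123.456ms
print(format_time_elapsed(1.0, use_ms_precision=True))       # 1000.0ms
```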
filenameprocessor/tests/test_file_key_validation.py

Lines changed: 15 additions & 3 deletions
@@ -9,7 +9,7 @@

 # Ensure environment variables are mocked before importing from src files
 with patch.dict("os.environ", MOCK_ENVIRONMENT_DICT):
-    from file_key_validation import is_valid_datetime, validate_file_key
+    from file_key_validation import is_file_in_directory_root, is_valid_datetime, validate_file_key
     from errors import InvalidFileKeyError

 VALID_FLU_EMIS_FILE_KEY = MockFileDetails.emis_flu.file_key
@@ -18,9 +18,21 @@

 class TestFileKeyValidation(TestCase):
     """Tests for file_key_validation functions"""
+    def test_is_file_in_directory_root(self):
+        test_cases = [
+            ("test_file.csv", True),
+            ("archive/test_file.csv", False),
+            ("processing/test_file.csv", False),
+            ("lots/of/directories/init.py", False),
+        ]
+
+        for test_file_key, expected in test_cases:
+            with self.subTest():
+                self.assertEqual(is_file_in_directory_root(test_file_key), expected)
+
     def test_is_valid_datetime(self):
-        "Tests that is_valid_datetime returns True for valid datetimes, and false otherwise"
-        # Test case tuples are stuctured as (date_time_string, expected_result)
+        """Tests that is_valid_datetime returns True for valid datetimes, and false otherwise"""
+        # Test case tuples are structured as (date_time_string, expected_result)
         test_cases = [
             ("20200101T12345600", True),  # Valid datetime string with timezone
             ("20200101T123456", True),  # Valid datetime string without timezone

filenameprocessor/tests/test_logging_decorator.py

Lines changed: 28 additions & 28 deletions
@@ -110,33 +110,33 @@ def test_generate_and_send_logs(self):
         additional_log_data = {"additional_key": "additional_value"}
         start_time = 1672531200

-        # CASE: Successful log - is_error_log arg set to False
-        with (  # noqa: E999
-            patch("logging_decorator.logger") as mock_logger,  # noqa: E999
-            patch("logging_decorator.send_log_to_firehose") as mock_send_log_to_firehose,  # noqa: E999
-            patch("logging_decorator.time") as mock_time,  # noqa: E999
-        ):  # noqa: E999
-            mock_time.time.return_value = 1672531200.123456  # Mocks the end time to be 0.123456s after the start time
-            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=False)
-
-            expected_log_data = {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}
-            log_data = json.loads(mock_logger.info.call_args[0][0])
-            self.assertEqual(log_data, expected_log_data)
-            mock_send_log_to_firehose.assert_called_once_with(expected_log_data)
-
-        # CASE: Error log - is_error_log arg set to True
-        with (  # noqa: E999
-            patch("logging_decorator.logger") as mock_logger,  # noqa: E999
-            patch("logging_decorator.send_log_to_firehose") as mock_send_log_to_firehose,  # noqa: E999
-            patch("logging_decorator.time") as mock_time,  # noqa: E999
-        ):  # noqa: E999
-            mock_time.time.return_value = 1672531200.123456  # Mocks the end time to be 0.123456s after the start time
-            generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=True)
+        test_cases = [
+            ("Using standard log and seconds precision", False, False,
+             {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}),
+            ("Using error log and seconds precision", True, False,
+             {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}),
+            ("Using standard log and milliseconds precision", False, True,
+             {"base_key": "base_value", "time_taken": "123.456ms", "additional_key": "additional_value"})
+        ]

-            expected_log_data = {"base_key": "base_value", "time_taken": "0.12346s", "additional_key": "additional_value"}
-            log_data = json.loads(mock_logger.error.call_args[0][0])
-            self.assertEqual(log_data, expected_log_data)
-            mock_send_log_to_firehose.assert_called_once_with(expected_log_data)
+        for test_desc, use_error_log, use_ms_precision, expected_log_data in test_cases:
+            with self.subTest(test_desc):
+                with (  # noqa: E999
+                    patch("logging_decorator.logger") as mock_logger,  # noqa: E999
+                    patch("logging_decorator.send_log_to_firehose") as mock_send_log_to_firehose,  # noqa: E999
+                    patch("logging_decorator.time") as mock_time,  # noqa: E999
+                ):  # noqa: E999
+                    mock_time.time.return_value = 1672531200.123456  # Mocks end time to be 0.123456s after start
+                    generate_and_send_logs(start_time, base_log_data, additional_log_data, is_error_log=use_error_log,
+                                           use_ms_precision=use_ms_precision)
+
+                    if use_error_log:
+                        log_data = json.loads(mock_logger.error.call_args[0][0])
+                    else:
+                        log_data = json.loads(mock_logger.info.call_args[0][0])
+
+                    self.assertEqual(log_data, expected_log_data)
+                    mock_send_log_to_firehose.assert_called_once_with(expected_log_data)

     def test_logging_successful_validation(self):
         """Tests that the correct logs are sent to cloudwatch and splunk when file validation is successful"""
@@ -156,7 +156,7 @@ def test_logging_successful_validation(self):
         expected_log_data = {
             "function_name": "filename_processor_handle_record",
             "date_time": fixed_datetime.strftime("%Y-%m-%d %H:%M:%S"),
-            "time_taken": "1.0s",
+            "time_taken": "1000.0ms",
             "statusCode": 200,
             "message": "Successfully sent to SQS for further processing",
             "file_key": FILE_DETAILS.file_key,
@@ -188,7 +188,7 @@ def test_logging_failed_validation(self):
         expected_log_data = {
             "function_name": "filename_processor_handle_record",
             "date_time": fixed_datetime.strftime("%Y-%m-%d %H:%M:%S"),
-            "time_taken": "1.0s",
+            "time_taken": "1000.0ms",
             "statusCode": 403,
             "message": "Infrastructure Level Response Value - Processing Error",
             "file_key": FILE_DETAILS.file_key,

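The assertions recover the serialised log line from the mocked logger via call_args: call_args[0] is the tuple of positional arguments from the most recent call, so call_args[0][0] is the JSON string handed to logger.info or logger.error. A tiny illustration of that pattern, independent of the code under test:

```python
import json
from unittest.mock import MagicMock

mock_logger = MagicMock()
mock_logger.info(json.dumps({"time_taken": "123.456ms"}))

# call_args[0] is the positional-argument tuple of the last call, so [0][0] is the logged JSON string
logged_json = mock_logger.info.call_args[0][0]
assert json.loads(logged_json) == {"time_taken": "123.456ms"}
```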
sonar-project.properties

Lines changed: 1 addition & 1 deletion
@@ -9,4 +9,4 @@ sonar.issue.ignore.multicriteria=exclude_snomed_urls,exclude_hl7_urls
 sonar.issue.ignore.multicriteria.exclude_snomed_urls.ruleKey=python:S5332
 sonar.issue.ignore.multicriteria.exclude_snomed_urls.resourceKey=**http://snomed\.info/sct**
 sonar.issue.ignore.multicriteria.exclude_hl7_urls.ruleKey=python:S5332
-sonar.issue.ignore.multicriteria.exclude_hl7_urls.resourceKey=**http://terminology\.hl7\.org/CodeSystem/v3-NullFlavor**
+sonar.issue.ignore.multicriteria.exclude_hl7_urls.resourceKey=**http://terminology\.hl7\.org/CodeSystem/v3-NullFlavor**

(The removed and re-added lines are textually identical; the change appears to be limited to whitespace, most likely the addition of a trailing newline at the end of the file.)
