Skip to content

Commit 94b348f

Browse files
committed
Skip file encoding check for internal systems
1 parent 93448ef commit 94b348f

File tree

6 files changed

+72
-18
lines changed

6 files changed

+72
-18
lines changed

recordprocessor/src/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@
4848
]
4949

5050

51+
INTERNAL_SUPPLIER_NAMES = {"DPSFULL", "DPSREDUCED"}
52+
53+
5154
class FileStatus:
5255
"""File status constants"""
5356

recordprocessor/src/file_level_validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def file_level_validation(incoming_message_body: dict) -> dict:
7878
created_at_formatted_string = incoming_message_body.get("created_at_formatted_string")
7979

8080
# Fetch the data
81-
csv_reader = get_csv_content_dict_reader(file_key)
81+
csv_reader = get_csv_content_dict_reader(file_key, supplier)
8282

8383
validate_content_headers(csv_reader)
8484

recordprocessor/src/utils_for_recordprocessor.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from csv import DictReader
66
from io import BytesIO, TextIOWrapper
77
from clients import s3_client, lambda_client, logger
8-
from constants import SOURCE_BUCKET_NAME, FILE_NAME_PROC_LAMBDA_NAME
8+
from constants import SOURCE_BUCKET_NAME, FILE_NAME_PROC_LAMBDA_NAME, INTERNAL_SUPPLIER_NAMES
99

1010

1111
def get_environment() -> str:
@@ -15,18 +15,24 @@ def get_environment() -> str:
1515
return _env if _env in ["internal-dev", "int", "ref", "sandbox", "prod"] else "internal-dev"
1616

1717

18-
def get_csv_content_dict_reader(file_key: str) -> DictReader:
18+
def get_csv_content_dict_reader(file_key: str, supplier: str) -> DictReader:
1919
"""Returns the requested file contents from the source bucket in the form of a DictReader"""
2020
response = s3_client.get_object(Bucket=SOURCE_BUCKET_NAME, Key=file_key)
2121
s3_object_bytes_io = BytesIO(response["Body"].read())
22-
encoding = "utf-8" if is_utf8(s3_object_bytes_io, file_key) else "windows-1252"
22+
encoding = "utf-8"
23+
24+
# Skip encoding check if supplier is internal e.g. DPS
25+
if supplier not in INTERNAL_SUPPLIER_NAMES:
26+
if not is_utf8(s3_object_bytes_io, file_key):
27+
encoding = "windows-1252"
28+
2329
text_io = TextIOWrapper(s3_object_bytes_io, encoding=encoding, newline="")
2430
return DictReader(text_io, delimiter="|")
2531

2632

2733
def is_utf8(file_bytes: BytesIO, file_key: str) -> bool:
28-
"""Best effort attempt to check if the given file is UTF-8. VED-754 some suppliers may provide non UTF-8
29-
encoded CSV files e.g. Windows-1252, so we need to know whether or not to fallback"""
34+
"""Checks if the given file is UTF-8. VED-754 some suppliers may provide non UTF-8encoded CSV files
35+
e.g. Windows-1252, so we need to know whether or not to fallback"""
3036
for line in file_bytes:
3137
try:
3238
line.decode("utf-8")

recordprocessor/tests/test_recordprocessor_main.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,13 @@ def tearDown(self) -> None:
6060
GenericTearDown(s3_client, firehose_client, kinesis_client)
6161

6262
@staticmethod
63-
def upload_source_files(source_file_content): # pylint: disable=dangerous-default-value
64-
"""Uploads a test file with the TEST_FILE_KEY (RSV EMIS file) the given file content to the source bucket"""
65-
s3_client.put_object(Bucket=BucketNames.SOURCE, Key=mock_rsv_emis_file.file_key, Body=source_file_content)
63+
def upload_source_files(
64+
source_file_content: str,
65+
file_key: str = mock_rsv_emis_file.file_key
66+
):
67+
"""Uploads a test file with the TEST_FILE_KEY (RSV EMIS file as default) and the given file content to the
68+
source bucket"""
69+
s3_client.put_object(Bucket=BucketNames.SOURCE, Key=file_key, Body=source_file_content)
6670

6771
@staticmethod
6872
def get_shard_iterator():
@@ -96,7 +100,7 @@ def make_inf_ack_assertions(self, file_details: FileDetails, passed_validation:
96100

97101
self.assertEqual(actual_rows, [InfAckFileRows.HEADERS, expected_row])
98102

99-
def make_kinesis_assertions(self, test_cases):
103+
def make_kinesis_assertions(self, test_cases, file_details: FileDetails = mock_rsv_emis_file):
100104
"""
101105
The input is a list of test_case tuples where each tuple is structured as
102106
(test_name, index, expected_kinesis_data_ignoring_fhir_json, expect_success).
@@ -122,7 +126,7 @@ def make_kinesis_assertions(self, test_cases):
122126
with self.subTest(test_name):
123127

124128
kinesis_record = kinesis_records[index]
125-
self.assertEqual(kinesis_record["PartitionKey"], mock_rsv_emis_file.queue_name)
129+
self.assertEqual(kinesis_record["PartitionKey"], file_details.queue_name)
126130
self.assertEqual(kinesis_record["SequenceNumber"], f"{index+1}")
127131

128132
# Ensure that arrival times are sequential
@@ -132,11 +136,11 @@ def make_kinesis_assertions(self, test_cases):
132136

133137
kinesis_data = json.loads(kinesis_record["Data"].decode("utf-8"), parse_float=Decimal)
134138
expected_kinesis_data = {
135-
"row_id": f"{mock_rsv_emis_file.message_id}^{index+1}",
136-
"file_key": mock_rsv_emis_file.file_key,
137-
"supplier": mock_rsv_emis_file.supplier,
138-
"vax_type": mock_rsv_emis_file.vaccine_type,
139-
"created_at_formatted_string": mock_rsv_emis_file.created_at_formatted_string,
139+
"row_id": f"{file_details.message_id}^{index+1}",
140+
"file_key": file_details.file_key,
141+
"supplier": file_details.supplier,
142+
"vax_type": file_details.vaccine_type,
143+
"created_at_formatted_string": file_details.created_at_formatted_string,
140144
**expected_kinesis_data,
141145
}
142146
if expect_success and "fhir_json" not in expected_kinesis_data:
@@ -370,7 +374,7 @@ def test_e2e_successfully_processes_windows_1252_encoded_file_contents(self):
370374

371375
main(mock_rsv_emis_file.event_full_permissions)
372376

373-
# Assertion case tuples are stuctured as
377+
# Assertion case tuples are structured as
374378
# (test_name, index, expected_kinesis_data_ignoring_fhir_json,expect_success)
375379
assertion_cases = [
376380
(
@@ -387,6 +391,34 @@ def test_e2e_successfully_processes_windows_1252_encoded_file_contents(self):
387391
mock_rsv_emis_file.file_key
388392
)
389393

394+
@patch("utils_for_recordprocessor.is_utf8")
395+
def test_e2e_skips_encoding_check_for_internal_suppliers(self, mock_utf_checker):
396+
"""
397+
VED-754 tests the handler successfully processes the file and skips the encoding check when the supplier is
398+
internal e.g. DPS. This is because we know they will be using the desired utf-8 encoding and avoids the extra
399+
processing time to validate the encoding. They will also be providing very large batch files so skipping this is
400+
very important.
401+
"""
402+
self.upload_source_files(ValidMockFileContent.with_new_dps_record, MockFileDetails.dps_flu.file_key)
403+
404+
main(MockFileDetails.dps_flu.event_full_permissions)
405+
406+
# Assertion case tuples are structured as
407+
# (test_name, index, expected_kinesis_data_ignoring_fhir_json,expect_success)
408+
assertion_cases = [
409+
(
410+
"CREATE success",
411+
0,
412+
{"operation_requested": "CREATE", "local_id": "DPS_ID_1234^DPS_SYSTEM"},
413+
True,
414+
)
415+
]
416+
self.make_inf_ack_assertions(file_details=MockFileDetails.dps_flu, passed_validation=True)
417+
self.mock_util_logger.info.assert_not_called()
418+
mock_utf_checker.assert_not_called()
419+
self.make_kinesis_assertions(assertion_cases, file_details=MockFileDetails.dps_flu)
420+
self.mock_util_logger.info.assert_not_called()
421+
390422

391423
if __name__ == "__main__":
392424
unittest.main()

recordprocessor/tests/test_utils_for_recordprocessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def test_get_csv_content_dict_reader(self):
4848
"""Tests that get_csv_content_dict_reader returns the correct csv data"""
4949
self.upload_source_file(test_file.file_key, ValidMockFileContent.with_new_and_update)
5050
expected_output = csv.DictReader(StringIO(ValidMockFileContent.with_new_and_update), delimiter="|")
51-
result = get_csv_content_dict_reader(test_file.file_key)
51+
result = get_csv_content_dict_reader(test_file.file_key, "RAVS")
5252
self.assertEqual(list(result), list(expected_output))
5353

5454
def test_get_environment(self):

recordprocessor/tests/utils_for_recordprocessor_tests/values_for_recordprocessor_tests.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,17 @@ class MockFileRows:
112112
'"J82068"|"https://fhir.nhs.uk/Id/ods-organization-code"'
113113
)
114114

115+
NEW_DPS_RECORD = (
116+
'9674963871|"SABINA"|"GREIR"|"20190131"|"2"|"GU14 6TU"|"20240610T183325"|"J82067"|'
117+
'"https://fhir.nhs.uk/Id/ods-organization-code"|"DPS_ID_1234"|"DPS_SYSTEM"|'
118+
'"new"|"Ellena"|"O\'Reilly"|"20240101"|"TRUE"|'
119+
'"1303503001"|"Administration of vaccine product containing only Human orthopneumovirus antigen (procedure)"|'
120+
'1|"42605811000001109"|"Abrysvo vaccine powder and solvent for solution for injection 0.5ml vials (Pfizer Ltd) '
121+
'(product)"|"Pfizer"|"RSVTEST"|"20241231"|"368208006"|"Left upper arm structure (body structure)"|'
122+
'"78421000"|"Intramuscular route (qualifier value)"|"0.5"|"258773002"|"Milliliter (qualifier value)"|"Test"|'
123+
'"J82067"|"https://fhir.nhs.uk/Id/ods-organization-code"'
124+
)
125+
115126
# For test case VED-754 - windows-1252 encoding issues only surfaces with characters outside of 0-127 ASCII
116127
NEW_WITH_SPECIAL_CHARACTERS = (
117128
'9674963871|"SABINA"|"GRÉIR"|"20190131"|"2"|"GU14 6TU"|"20240610T183325"|"J82067"|'
@@ -131,6 +142,7 @@ class ValidMockFileContent:
131142
headers = MockFileRows.HEADERS
132143
with_new = headers + "\n" + MockFileRows.NEW
133144
with_new_special_char = headers + "\n" + MockFileRows.NEW_WITH_SPECIAL_CHARACTERS
145+
with_new_dps_record = headers + "\n" + MockFileRows.NEW_DPS_RECORD
134146
with_update = headers + "\n" + MockFileRows.UPDATE
135147
with_delete = headers + "\n" + MockFileRows.DELETE
136148
with_update_and_delete = headers + "\n" + MockFileRows.UPDATE + "\n" + MockFileRows.DELETE
@@ -210,6 +222,7 @@ class MockFileDetails:
210222
rsv_emis = FileDetails("RSV", "EMIS", "8HK48")
211223
flu_emis = FileDetails("FLU", "EMIS", "YGM41")
212224
ravs_flu = FileDetails("FLU", "RSV", "X26")
225+
dps_flu = FileDetails("FLU", "DPSFULL", "DPSFULL")
213226

214227

215228
class UnorderedFieldDictionaries:

0 commit comments

Comments
 (0)