Skip to content

Commit 6eaea79

Browse files
committed
Initial fix
1 parent ac1a001 commit 6eaea79

File tree

3 files changed

+66
-4
lines changed

3 files changed

+66
-4
lines changed

recordprocessor/src/utils_for_recordprocessor.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import os
44
import json
55
from csv import DictReader
6-
from io import TextIOWrapper
6+
from io import BytesIO, TextIOWrapper
7+
from botocore.response import StreamingBody
78
from clients import s3_client, lambda_client, logger
89
from constants import SOURCE_BUCKET_NAME, FILE_NAME_PROC_LAMBDA_NAME
910

@@ -17,12 +18,31 @@ def get_environment() -> str:
1718

1819
def get_csv_content_dict_reader(file_key: str) -> DictReader:
    """Returns the requested file contents from the source bucket in the form of a DictReader.

    The whole object is read into memory so its encoding can be checked before parsing: the content is decoded
    as UTF-8 when is_utf8 accepts it, otherwise as windows-1252 (VED-754). Fields are split on the "|" delimiter.
    """
    response = s3_client.get_object(Bucket=SOURCE_BUCKET_NAME, Key=file_key)
    s3_object_body: StreamingBody = response["Body"]
    try:
        # Buffer the body in a BytesIO so is_utf8 can scan the bytes and rewind before they are decoded.
        s3_object_bytes_io = BytesIO(s3_object_body.read())
    finally:
        # Release the underlying HTTP response even if the read fails part-way through.
        s3_object_body.close()

    encoding = "utf-8" if is_utf8(s3_object_bytes_io, file_key) else "windows-1252"
    # newline="" disables newline translation so the csv module sees the original line endings.
    text_io = TextIOWrapper(s3_object_bytes_io, encoding=encoding, newline="")
    return DictReader(text_io, delimiter="|")
2429

2530

31+
def is_utf8(file_bytes: BytesIO, file_key: str) -> bool:
    """Best effort attempt to check if the given file is UTF-8. VED-754 some suppliers may provide non UTF-8
    encoded CSV files e.g. Windows-1252, so we need to know whether or not to fallback.

    The buffer is scanned from its start, whatever its position on entry, and is always left rewound to
    position 0 so the caller can read it again. file_key is only used in the log message.

    Returns True when every line decodes as UTF-8 (including an empty buffer), otherwise False.
    """
    # Start from the beginning so an already-consumed buffer is not reported as UTF-8 without being checked.
    file_bytes.seek(0)
    try:
        # Splitting on b"\n" is safe here: no byte of a multi-byte UTF-8 sequence can equal the newline byte.
        for line in file_bytes:
            line.decode("utf-8")
    except UnicodeDecodeError:
        logger.info("Received a file which was not utf-8 encoded: %s", file_key)
        return False
    finally:
        # Single rewind point for both outcomes.
        file_bytes.seek(0)
    return True
44+
45+
2646
def create_diagnostics_dictionary(error_type, status_code, error_message) -> dict:
    """Returns a dictionary containing the error_type, statusCode, and error_message.

    The arguments are stored unchanged. Note the status code is keyed as "statusCode" (camelCase), unlike the
    snake_case "error_type" and "error_message" keys.
    """
    return {"error_type": error_type, "statusCode": status_code, "error_message": error_message}

recordprocessor/tests/test_recordprocessor_main.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,11 @@ def setUp(self) -> None:
4646
GenericSetUp(s3_client, firehose_client, kinesis_client)
4747

4848
redis_patcher = patch("mappings.redis_client")
49+
util_logger_patcher = patch("utils_for_recordprocessor.logger")
4950
self.addCleanup(redis_patcher.stop)
51+
self.addCleanup(util_logger_patcher.stop)
5052
mock_redis_client = redis_patcher.start()
53+
self.mock_util_logger = util_logger_patcher.start()
5154
mock_redis_client.hget.return_value = json.dumps([{
5255
"code": "55735004",
5356
"term": "Respiratory syncytial virus infection (disorder)"
@@ -358,6 +361,32 @@ def test_e2e_kinesis_failed(self):
358361
}
359362
mock_send_log_to_firehose.assert_called_with(expected_log_data)
360363

364+
def test_e2e_successfully_processes_windows_1252_encoded_file_contents(self):
    """
    VED-754 tests the handler successfully processes windows-1252 encoded files that contain special characters
    which would otherwise fail for UTF-8. This is a temporary workaround for suppliers using legacy formats.
    """
    # Upload the content as windows-1252 bytes; the special character in the row is what makes it non UTF-8.
    self.upload_source_files(ValidMockFileContent.with_new_special_char.encode("windows-1252"))

    main(mock_rsv_emis_file.event_full_permissions)

    # Assertion case tuples are structured as
    # (test_name, index, expected_kinesis_data_ignoring_fhir_json, expect_success)
    assertion_cases = [
        (
            "CREATE success",
            0,
            {"operation_requested": "CREATE", "local_id": MockLocalIds.RSV_001_RAVS},
            True,
        )
    ]
    self.make_inf_ack_assertions(file_details=mock_rsv_emis_file, passed_validation=True)
    self.make_kinesis_assertions(assertion_cases)
    # The fallback must have been taken exactly once, and logged against this file's key.
    self.mock_util_logger.info.assert_called_once_with(
        "Received a file which was not utf-8 encoded: %s",
        mock_rsv_emis_file.file_key
    )
389+
361390

362391
if __name__ == "__main__":
363392
unittest.main()

recordprocessor/tests/utils_for_recordprocessor_tests/values_for_recordprocessor_tests.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,25 @@ class MockFileRows:
112112
'"J82068"|"https://fhir.nhs.uk/Id/ods-organization-code"'
113113
)
114114

115+
# For test case VED-754 - windows-1252 encoding issues only surface with characters outside the 0-127 ASCII range
116+
NEW_WITH_SPECIAL_CHARACTERS = (
117+
'9674963871|"SABINA"|"GRÉIR"|"20190131"|"2"|"GU14 6TU"|"20240610T183325"|"J82067"|'
118+
f'"https://fhir.nhs.uk/Id/ods-organization-code"|"{MockUniqueIds.RSV_001}"|"{MockUniqueIdUris.RAVS}"|'
119+
'"new"|"Ellena"|"O\'Reilly"|"20240101"|"TRUE"|'
120+
'"1303503001"|"Administration of vaccine product containing only Human orthopneumovirus antigen (procedure)"|'
121+
'1|"42605811000001109"|"Abrysvo vaccine powder and solvent for solution for injection 0.5ml vials (Pfizer Ltd) '
122+
'(product)"|"Pfizer"|"RSVTEST"|"20241231"|"368208006"|"Left upper arm structure (body structure)"|'
123+
'"78421000"|"Intramuscular route (qualifier value)"|"0.5"|"258773002"|"Milliliter (qualifier value)"|"Test"|'
124+
'"J82067"|"https://fhir.nhs.uk/Id/ods-organization-code"'
125+
)
126+
115127

116128
class ValidMockFileContent:
117129
"""Class containing valid file content for use in tests"""
118130

119131
headers = MockFileRows.HEADERS
120132
with_new = headers + "\n" + MockFileRows.NEW
133+
with_new_special_char = headers + "\n" + MockFileRows.NEW_WITH_SPECIAL_CHARACTERS
121134
with_update = headers + "\n" + MockFileRows.UPDATE
122135
with_delete = headers + "\n" + MockFileRows.DELETE
123136
with_update_and_delete = headers + "\n" + MockFileRows.UPDATE + "\n" + MockFileRows.DELETE

0 commit comments

Comments
 (0)