Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions recordprocessor/src/utils_for_recordprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import json
from csv import DictReader
from io import TextIOWrapper
from io import BytesIO, TextIOWrapper
from clients import s3_client, lambda_client, logger
from constants import SOURCE_BUCKET_NAME, FILE_NAME_PROC_LAMBDA_NAME

Expand All @@ -17,12 +17,28 @@ def get_environment() -> str:

def get_csv_content_dict_reader(file_key: str) -> DictReader:
"""Returns the requested file contents from the source bucket in the form of a DictReader"""
response = s3_client.get_object(Bucket=os.getenv("SOURCE_BUCKET_NAME"), Key=file_key)
binary_io = response["Body"]
text_io = TextIOWrapper(binary_io, encoding="utf-8", newline="")
response = s3_client.get_object(Bucket=SOURCE_BUCKET_NAME, Key=file_key)
s3_object_bytes_io = BytesIO(response["Body"].read())
encoding = "utf-8" if is_utf8(s3_object_bytes_io, file_key) else "windows-1252"
text_io = TextIOWrapper(s3_object_bytes_io, encoding=encoding, newline="")
return DictReader(text_io, delimiter="|")


def is_utf8(file_bytes: BytesIO, file_key: str) -> bool:
"""Best effort attempt to check if the given file is UTF-8. VED-754 some suppliers may provide non UTF-8
encoded CSV files e.g. Windows-1252, so we need to know whether or not to fallback"""
for line in file_bytes:
try:
line.decode("utf-8")
except UnicodeDecodeError:
logger.info("Received a file which was not utf-8 encoded: %s", file_key)
file_bytes.seek(0)
return False

file_bytes.seek(0)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For discussion, this is a bit naughty. It avoids the function having a side effect of fast forwarding the file to a certain point and ensures it always sets the stream back to the start.

Other option would be to create to BytesIO object. One for encoding validation, and then one to pass to the dict reader. However, concluded minimising the objects is better even is we have to manually call .seek(0).

return True


def create_diagnostics_dictionary(error_type, status_code, error_message) -> dict:
"""Returns a dictionary containing the error_type, statusCode, and error_message"""
return {"error_type": error_type, "statusCode": status_code, "error_message": error_message}
Expand Down
29 changes: 29 additions & 0 deletions recordprocessor/tests/test_recordprocessor_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ def setUp(self) -> None:
GenericSetUp(s3_client, firehose_client, kinesis_client)

redis_patcher = patch("mappings.redis_client")
util_logger_patcher = patch("utils_for_recordprocessor.logger")
self.addCleanup(redis_patcher.stop)
self.addCleanup(util_logger_patcher.stop)
mock_redis_client = redis_patcher.start()
self.mock_util_logger = util_logger_patcher.start()
mock_redis_client.hget.return_value = json.dumps([{
"code": "55735004",
"term": "Respiratory syncytial virus infection (disorder)"
Expand Down Expand Up @@ -358,6 +361,32 @@ def test_e2e_kinesis_failed(self):
}
mock_send_log_to_firehose.assert_called_with(expected_log_data)

def test_e2e_successfully_processes_windows_1252_encoded_file_contents(self):
"""
VED-754 tests the handler successfully processed windows-1252 encoded files that contain special characters
which would otherwise fail for UTF-8. This is a temporary workaround for suppliers using legacy formats.
"""
self.upload_source_files(ValidMockFileContent.with_new_special_char.encode("windows-1252"))

main(mock_rsv_emis_file.event_full_permissions)

# Assertion case tuples are stuctured as
# (test_name, index, expected_kinesis_data_ignoring_fhir_json,expect_success)
assertion_cases = [
(
"CREATE success",
0,
{"operation_requested": "CREATE", "local_id": MockLocalIds.RSV_001_RAVS},
True,
)
]
self.make_inf_ack_assertions(file_details=mock_rsv_emis_file, passed_validation=True)
self.make_kinesis_assertions(assertion_cases)
self.mock_util_logger.info.assert_called_once_with(
"Received a file which was not utf-8 encoded: %s",
mock_rsv_emis_file.file_key
)


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,25 @@ class MockFileRows:
'"J82068"|"https://fhir.nhs.uk/Id/ods-organization-code"'
)

# For test case VED-754 - windows-1252 encoding issues only surfaces with characters outside of 0-127 ASCII
NEW_WITH_SPECIAL_CHARACTERS = (
'9674963871|"SABINA"|"GRÉIR"|"20190131"|"2"|"GU14 6TU"|"20240610T183325"|"J82067"|'
f'"https://fhir.nhs.uk/Id/ods-organization-code"|"{MockUniqueIds.RSV_001}"|"{MockUniqueIdUris.RAVS}"|'
'"new"|"Ellena"|"O\'Reilly"|"20240101"|"TRUE"|'
'"1303503001"|"Administration of vaccine product containing only Human orthopneumovirus antigen (procedure)"|'
'1|"42605811000001109"|"Abrysvo vaccine powder and solvent for solution for injection 0.5ml vials (Pfizer Ltd) '
'(product)"|"Pfizer"|"RSVTEST"|"20241231"|"368208006"|"Left upper arm structure (body structure)"|'
'"78421000"|"Intramuscular route (qualifier value)"|"0.5"|"258773002"|"Milliliter (qualifier value)"|"Test"|'
'"J82067"|"https://fhir.nhs.uk/Id/ods-organization-code"'
)


class ValidMockFileContent:
"""Class containing valid file content for use in tests"""

headers = MockFileRows.HEADERS
with_new = headers + "\n" + MockFileRows.NEW
with_new_special_char = headers + "\n" + MockFileRows.NEW_WITH_SPECIAL_CHARACTERS
with_update = headers + "\n" + MockFileRows.UPDATE
with_delete = headers + "\n" + MockFileRows.DELETE
with_update_and_delete = headers + "\n" + MockFileRows.UPDATE + "\n" + MockFileRows.DELETE
Expand Down
Loading