
Commit bd6dee4 (parent: 5d3555a)

VED-763: Append checksum to filename in MESH processor to support duplicates with different content.
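
In short: when the MESH processor copies a file into the destination bucket, it now inserts the object's content checksum into the key, so two deliveries that reuse the same filename but carry different content no longer overwrite each other. A minimal sketch of the naming scheme (checksum values here are made-up placeholders; the real value comes from S3 object attributes, as in the converter change below):

# Sketch only - mirrors the add_checksum_to_filename helper added in this commit,
# not the committed code itself. Checksum strings are hypothetical placeholders.
def with_checksum(filename: str, checksum: str) -> str:
    name, dot, extension = filename.rpartition(".")
    # Insert the checksum before the extension, or append it if there is no extension.
    return f"{name}_{checksum}.{extension}" if dot else f"{filename}_{checksum}"

# Same reported filename, two different payloads -> two distinct destination keys.
print(with_checksum("Flu_Vaccinations_v5_YGM41_20000101T00000001.csv", "undw3w=="))
# Flu_Vaccinations_v5_YGM41_20000101T00000001_undw3w==.csv
print(with_checksum("Flu_Vaccinations_v5_YGM41_20000101T00000001.csv", "HVspqQ=="))
# Flu_Vaccinations_v5_YGM41_20000101T00000001_HVspqQ==.csv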

4 files changed: 121 additions, 61 deletions

filenameprocessor/src/file_validation.py

Lines changed: 35 additions & 15 deletions
@@ -38,34 +38,54 @@ def validate_file_key(file_key: str) -> tuple[str, str]:
     Checks that all elements of the file key are valid, raises an exception otherwise.
     Returns a tuple containing the vaccine_type and supplier (both converted to upper case).
     """
+    file_key = file_key.upper()

-    if not match(r"^[^_.]*_[^_.]*_[^_.]*_[^_.]*_[^_.]*\.[^_.]*$", file_key):
-        error_message = "Initial file validation failed: invalid file key format"
-        raise InvalidFileKeyError(error_message)
+    file_name_and_extension = file_key.rsplit(".", 1)
+    if len(file_name_and_extension) != 2:
+        raise InvalidFileKeyError("Initial file validation failed: missing file extension")

-    file_key = file_key.upper()
-    file_key_parts_without_extension = file_key.split(".")[0].split("_")
+    file_key_parts_without_extension = file_name_and_extension[0].split("_")
+    if len(file_key_parts_without_extension) < 5:
+        raise InvalidFileKeyError("Initial file validation failed: not enough parts in file key")

     vaccine_type = file_key_parts_without_extension[0]
     vaccination = file_key_parts_without_extension[1]
     version = file_key_parts_without_extension[2]
     ods_code = file_key_parts_without_extension[3]
     timestamp = file_key_parts_without_extension[4]
-    extension = file_key.split(".")[1]
+    extension = file_name_and_extension[1]
     supplier = get_supplier_system_from_cache(ods_code)

     valid_vaccine_types = get_valid_vaccine_types_from_cache()

     # Validate each file key element
-    if not (
-        vaccine_type in valid_vaccine_types
-        and vaccination == "VACCINATIONS"
-        and version in VALID_VERSIONS
-        and supplier  # Note that if supplier could be identified, this also implies that ODS code is valid
-        and is_valid_datetime(timestamp)
-        and ((extension == "CSV") or (extension == "DAT"))  # The DAT extension has been added for MESH file processing
-    ):
-        raise InvalidFileKeyError("Initial file validation failed: invalid file key")
+    # if not (
+    #     vaccine_type in valid_vaccine_types
+    #     and vaccination == "VACCINATIONS"
+    #     and version in VALID_VERSIONS
+    #     and supplier  # Note that if supplier could be identified, this also implies that ODS code is valid
+    #     and is_valid_datetime(timestamp)
+    #     and ((extension == "CSV") or (extension == "DAT"))  # The DAT extension has been added for MESH file processing
+    # ):
+    #     raise InvalidFileKeyError("Initial file validation failed: invalid file key")
+    #
+    if vaccine_type not in valid_vaccine_types:
+        raise InvalidFileKeyError("Initial file validation failed: invalid vaccine type")
+
+    if vaccination != "VACCINATIONS":
+        raise InvalidFileKeyError("Initial file validation failed: file key must contain VACCINATIONS")
+
+    if version not in VALID_VERSIONS:
+        raise InvalidFileKeyError("Initial file validation failed: invalid file version")
+
+    if not supplier:
+        raise InvalidFileKeyError("Initial file validation failed: invalid supplier ODS code")
+
+    if not is_valid_datetime(timestamp):
+        raise InvalidFileKeyError("Initial file validation failed: invalid timestamp")
+
+    if extension not in ["CSV", "DAT"]:
+        raise InvalidFileKeyError("Initial file validation failed: unsupported file extension")

     return vaccine_type, supplier
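
The removed regex required exactly five underscore-separated parts and exactly one dot, so a key with a checksum segment appended would have been rejected outright. The rewritten validation only requires at least five parts and never reads anything after the timestamp, which is why the checksum-suffixed key added to the tests below now passes. A quick sketch of that parsing behaviour (plain Python for illustration, not the committed function, and skipping the cache lookups):

# Illustration only: why a checksum-suffixed file key satisfies the new checks.
file_key = "RSV_Vaccinations_v5_X8E5B_20000101T00000001_qZYjV+omQMU=.csv".upper()

name, extension = file_key.rsplit(".", 1)   # split the extension off from the right
parts = name.split("_")                     # six parts: the checksum is the extra one
assert len(parts) >= 5 and extension in ["CSV", "DAT"]

vaccine_type, vaccination, version, ods_code, timestamp = parts[:5]
print(vaccine_type, ods_code, timestamp)    # RSV X8E5B 20000101T00000001
# parts[5] ("QZYJV+OMQMU=") is never inspected, so the appended checksum is harmless here.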

filenameprocessor/tests/test_file_key_validation.py

Lines changed: 21 additions & 21 deletions
@@ -65,6 +65,8 @@ def test_validate_file_key(self, mock_hkeys, mock_hget):
            (VALID_FLU_EMIS_FILE_KEY.upper(), "YGM41", ("FLU", "EMIS")),
            # Valid RSV/ RAVS file key
            (VALID_RSV_RAVS_FILE_KEY, "X8E5B", ("RSV", "RAVS")),
+            # Valid file key with checksum
+            ("RSV_Vaccinations_v5_X8E5B_20000101T00000001_qZYjV+omQMU=.csv", "X8E5B", ("RSV", "RAVS")),
        ]

        for file_key, ods_code, expected_result in test_cases_for_success_scenarios:
@@ -73,43 +75,41 @@ def test_validate_file_key(self, mock_hkeys, mock_hget):
            mock_hkeys.assert_called_with("vacc_to_diseases")
            mock_hget.assert_called_with("ods_code_to_supplier", ods_code)

-        key_format_error_message = "Initial file validation failed: invalid file key format"
-        invalid_file_key_error_message = "Initial file validation failed: invalid file key"
        test_cases_for_failure_scenarios = [
            # File key with no '.'
-            (VALID_FLU_EMIS_FILE_KEY.replace(".", ""), key_format_error_message),
-            # File key with additional '.'
-            (VALID_FLU_EMIS_FILE_KEY[:2] + "." + VALID_FLU_EMIS_FILE_KEY[2:], key_format_error_message),
-            # File key with additional '_'
-            (VALID_FLU_EMIS_FILE_KEY[:2] + "_" + VALID_FLU_EMIS_FILE_KEY[2:], key_format_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace(".", ""), "Initial file validation failed: missing file extension"),
+            # File key with additional '.' in vaccine type
+            (VALID_FLU_EMIS_FILE_KEY[:2] + "." + VALID_FLU_EMIS_FILE_KEY[2:], "Initial file validation failed: invalid vaccine type"),
+            # File key with additional '_' in vaccine type
+            (VALID_FLU_EMIS_FILE_KEY[:2] + "_" + VALID_FLU_EMIS_FILE_KEY[2:], "Initial file validation failed: invalid vaccine type"),
            # File key with missing '_'
-            (VALID_FLU_EMIS_FILE_KEY.replace("_", "", 1), key_format_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("_", "", 1), "Initial file validation failed: not enough parts in file key"),
            # File key with missing '_'
-            (VALID_FLU_EMIS_FILE_KEY.replace("_", ""), key_format_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("_", ""), "Initial file validation failed: not enough parts in file key"),
            # File key with missing extension
-            (VALID_FLU_EMIS_FILE_KEY.replace(".csv", ""), key_format_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace(".csv", ""), "Initial file validation failed: missing file extension"),
            # File key with invalid vaccine type
-            (VALID_FLU_EMIS_FILE_KEY.replace("FLU", "Flue"), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("FLU", "Flue"), "Initial file validation failed: invalid vaccine type"),
            # File key with missing vaccine type
-            (VALID_FLU_EMIS_FILE_KEY.replace("FLU", ""), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("FLU", ""), "Initial file validation failed: invalid vaccine type"),
            # File key with invalid vaccinations element
-            (VALID_FLU_EMIS_FILE_KEY.replace("Vaccinations", "Vaccination"), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("Vaccinations", "Vaccination"), "Initial file validation failed: file key must contain VACCINATIONS"),
            # File key with missing vaccinations element
-            (VALID_FLU_EMIS_FILE_KEY.replace("Vaccinations", ""), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("Vaccinations", ""), "Initial file validation failed: file key must contain VACCINATIONS"),
            # File key with invalid version
-            (VALID_FLU_EMIS_FILE_KEY.replace("v5", "v4"), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("v5", "v4"), "Initial file validation failed: invalid file version"),
            # File key with missing version
-            (VALID_FLU_EMIS_FILE_KEY.replace("v5", ""), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("v5", ""), "Initial file validation failed: invalid file version"),
            # File key with invalid ODS code
-            (VALID_FLU_EMIS_FILE_KEY.replace("YGM41", "YGAM"), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("YGM41", "YGAM"), "Initial file validation failed: invalid supplier ODS code"),
            # File key with missing ODS code
-            (VALID_FLU_EMIS_FILE_KEY.replace("YGM41", ""), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("YGM41", ""), "Initial file validation failed: invalid supplier ODS code"),
            # File key with invalid timestamp
-            (VALID_FLU_EMIS_FILE_KEY.replace("20000101T00000001", "20200132T12345600"), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("20000101T00000001", "20200132T12345600"), "Initial file validation failed: invalid timestamp"),
            # File key with missing timestamp
-            (VALID_FLU_EMIS_FILE_KEY.replace("20000101T00000001", ""), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace("20000101T00000001", ""), "Initial file validation failed: invalid timestamp"),
            # File key with incorrect extension
-            (VALID_FLU_EMIS_FILE_KEY.replace(".csv", ".xlsx"), invalid_file_key_error_message),
+            (VALID_FLU_EMIS_FILE_KEY.replace(".csv", ".xlsx"), "Initial file validation failed: unsupported file extension"),
        ]

        for file_key, expected_result in test_cases_for_failure_scenarios:

mesh_processor/src/converter.py

Lines changed: 41 additions & 6 deletions
@@ -85,7 +85,13 @@ def move_file(source_bucket: str, source_key: str, destination_bucket: str, dest
    s3_client.delete_object(Bucket=source_bucket, Key=source_key)


-def transfer_multipart_content(bucket_name: str, file_key: str, boundary: bytes, filename: str) -> None:
+def transfer_multipart_content(
+    bucket_name: str,
+    file_key: str,
+    boundary: bytes,
+    filename: str,
+    checksum: str
+) -> None:
    with open(
        f"s3://{bucket_name}/{file_key}",
        "rb",
@@ -98,6 +104,7 @@ def transfer_multipart_content(bucket_name: str, file_key: str, boundary: bytes,
        if content_disposition:
            _, content_disposition_params = parse_header_value(content_disposition)
            filename = content_disposition_params.get("filename") or filename
+        filename = add_checksum_to_filename(filename, checksum)
        content_type = headers.get("Content-Type") or "application/octet-stream"

        with open(
@@ -117,26 +124,54 @@ def transfer_multipart_content(bucket_name: str, file_key: str, boundary: bytes,
    move_file(DESTINATION_BUCKET_NAME, f"streaming/{filename}", DESTINATION_BUCKET_NAME, filename)


+def get_checksum_value(checksum_obj: dict[str, str]) -> str:
+    return (
+        checksum_obj.get("ChecksumCRC64NVME")
+        or checksum_obj.get("ChecksumCRC32")
+        or checksum_obj.get("ChecksumCRC32C")
+        or checksum_obj.get("ChecksumSHA1")
+        or checksum_obj.get("ChecksumSHA256")
+        or checksum_obj.get("ChecksumMD5")
+    )
+
+
+def add_checksum_to_filename(filename: str, checksum: str) -> str:
+    filename_parts = filename.rsplit(".", 1)
+    return (
+        f"{filename_parts[0]}_{checksum}.{filename_parts[1]}"
+        if len(filename_parts) > 1
+        else f"{filename}_{checksum}"
+    )
+
+
def process_record(record: dict) -> None:
    bucket_name = record["s3"]["bucket"]["name"]
    file_key = record["s3"]["object"]["key"]
    logger.info(f"Processing {file_key}")

-    response = s3_client.head_object(Bucket=bucket_name, Key=file_key)
-    content_type = response['ContentType']
+    head_object_response = s3_client.head_object(Bucket=bucket_name, Key=file_key)
+    content_type = head_object_response['ContentType']
    media_type, content_type_params = parse_header_value(content_type)
-    filename = response["Metadata"].get("mex-filename") or file_key
+    filename = head_object_response["Metadata"].get("mex-filename") or file_key
+
+    get_object_attributes_response = s3_client.get_object_attributes(
+        Bucket=bucket_name,
+        Key=file_key,
+        ObjectAttributes=["Checksum"]
+    )
+    checksum_obj = get_object_attributes_response["Checksum"]
+    checksum = get_checksum_value(checksum_obj)

    # Handle multipart content by parsing the filename from headers and streaming the content from the first part
    if media_type.startswith("multipart/"):
        logger.info("Found multipart content")
        boundary = content_type_params["boundary"].encode("utf-8")
-        transfer_multipart_content(bucket_name, file_key, boundary, filename)
+        transfer_multipart_content(bucket_name, file_key, boundary, filename, checksum)
    else:
        s3_client.copy_object(
            Bucket=DESTINATION_BUCKET_NAME,
            CopySource={"Bucket": bucket_name, "Key": file_key},
-            Key=filename
+            Key=add_checksum_to_filename(filename, checksum),
        )

    logger.info(f"Transfer complete for {file_key}")

mesh_processor/tests/test_converter.py

Lines changed: 24 additions & 19 deletions
@@ -48,7 +48,7 @@ def test_non_multipart_content_type(self):
        result = invoke_lambda("test-csv-file.csv")
        self.assertEqual(result["statusCode"], 200)

-        get_target_response = s3.get_object(Bucket="destination-bucket", Key="overridden-filename.csv")
+        get_target_response = s3.get_object(Bucket="destination-bucket", Key="overridden-filename_undw3w==.csv")
        body = get_target_response["Body"].read().decode("utf-8")
        assert body == "some CSV content"

@@ -71,7 +71,7 @@ def test_non_multipart_content_type_without_mesh_metadata(self):
        result = invoke_lambda("test-csv-file.csv")
        self.assertEqual(result["statusCode"], 200)

-        response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file.csv")
+        response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file_undw3w==.csv")
        body = response["Body"].read().decode("utf-8")
        assert body == "some CSV content"

@@ -88,7 +88,8 @@ def test_multipart_content_type(self):
                    "some CSV content",
                    "--12345678--",
                    ""
-                ])
+                ]),
+                "test-csv-file_8aia6A==.csv"
            ),
            (
                "missing initial newline",
@@ -100,7 +101,8 @@ def test_multipart_content_type(self):
                    "some CSV content",
                    "--12345678--",
                    ""
-                ])
+                ]),
+                "test-csv-file_D2bxXQ==.csv"
            ),
            (
                "missing final newline",
@@ -112,7 +114,8 @@ def test_multipart_content_type(self):
                    "",
                    "some CSV content",
                    "--12345678--",
-                ])
+                ]),
+                "test-csv-file_waYOrA==.csv"
            ),
            (
                "multiple parts",
@@ -130,11 +133,12 @@ def test_multipart_content_type(self):
                    "some ignored content",
                    "--12345678--",
                    ""
-                ])
+                ]),
+                "test-csv-file_7zR2dg==.csv"
            )
        ]
-        for msg, body in cases:
-            with self.subTest(msg=msg, body=body):
+        for msg, body, filename in cases:
+            with self.subTest(msg=msg, body=body, filename=filename):
                s3 = boto3.client("s3", region_name="eu-west-2")
                s3.put_object(
                    Bucket="source-bucket",
@@ -146,7 +150,7 @@ def test_multipart_content_type(self):
                result = invoke_lambda("test-dat-file.dat")
                self.assertEqual(result["statusCode"], 200)

-                response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file.csv")
+                response = s3.get_object(Bucket="destination-bucket", Key=filename)
                body = response["Body"].read().decode("utf-8")
                assert body == "some CSV content"
                content_type = response["ContentType"]
@@ -165,7 +169,8 @@ def test_multipart_content_type_without_filename_in_headers(self):
                    "some CSV content",
                    "--12345678--",
                    ""
-                ])
+                ]),
+                "test-dat-file_r67kLQ==.dat"
            ),
            (
                "no header",
@@ -176,11 +181,12 @@ def test_multipart_content_type_without_filename_in_headers(self):
                    "some CSV content",
                    "--12345678--",
                    ""
-                ])
+                ]),
+                "test-dat-file_VPfBaQ==.dat"
            )
        ]
-        for msg, body in cases:
-            with self.subTest(msg=msg, body=body):
+        for msg, body, filename in cases:
+            with self.subTest(msg=msg, body=body, filename=filename):
                s3 = boto3.client("s3", region_name="eu-west-2")
                s3.put_object(
                    Bucket="source-bucket",
@@ -192,7 +198,7 @@ def test_multipart_content_type_without_filename_in_headers(self):
                result = invoke_lambda("test-dat-file.dat")
                self.assertEqual(result["statusCode"], 200)

-                response = s3.get_object(Bucket="destination-bucket", Key="test-dat-file.dat")
+                response = s3.get_object(Bucket="destination-bucket", Key=filename)
                body = response["Body"].read().decode("utf-8")
                assert body == "some CSV content"

@@ -217,7 +223,7 @@ def test_multipart_content_type_without_content_type_in_headers(self):
        result = invoke_lambda("test-dat-file.dat")
        self.assertEqual(result["statusCode"], 200)

-        response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file.csv")
+        response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file_dfUskg==.csv")
        body = response["Body"].read().decode("utf-8")
        assert body == "some CSV content"
        content_type = response["ContentType"]
@@ -245,7 +251,7 @@ def test_multipart_content_type_with_unix_line_endings(self):
        result = invoke_lambda("test-dat-file.dat")
        self.assertEqual(result["statusCode"], 200)

-        response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file.csv")
+        response = s3.get_object(Bucket="destination-bucket", Key="test-csv-file_HVspqQ==.csv")
        body = response["Body"].read().decode("utf-8")
        assert body == "some CSV content\nsplit across\nmultiple lines"

@@ -267,6 +273,5 @@ def test_unexpected_end_of_file(self):
        result = invoke_lambda("test-dat-file.dat")
        self.assertEqual(result["statusCode"], 500)

-        with self.assertRaises(ClientError) as e:
-            s3.head_object(Bucket="destination-bucket", Key="test-csv-file.csv")
-        self.assertEqual(e.exception.response["Error"]["Code"], "404")
+        list_objects_response = s3.list_objects_v2(Bucket="destination-bucket")
+        self.assertEqual(list_objects_response["KeyCount"], 0)
