From 5bd865a227bd7ad376f165ca8f7701a20d21a832 Mon Sep 17 00:00:00 2001 From: Mathieu Leplatre Date: Mon, 22 Dec 2025 12:58:24 +0100 Subject: [PATCH] Split attachment integrity slices by attachment sizes --- .../remotesettings/attachments_integrity.py | 36 ++++++++-- .../test_attachments_integrity.py | 68 ++++++++++++++++--- 2 files changed, 89 insertions(+), 15 deletions(-) diff --git a/checks/remotesettings/attachments_integrity.py b/checks/remotesettings/attachments_integrity.py index 3cb8fa6e..97bbcc65 100644 --- a/checks/remotesettings/attachments_integrity.py +++ b/checks/remotesettings/attachments_integrity.py @@ -6,7 +6,6 @@ import asyncio import logging -import math import aiohttp @@ -68,7 +67,8 @@ async def run(server: str, slice_percent: tuple[int, int] = (0, 100)) -> CheckRe # For each record that has an attachment, check the attachment content. attachments = [] - for entry, changeset in zip(entries, results): + total_size = 0 + for changeset in results: records = changeset["changes"] for record in records: if "attachment" not in record: @@ -76,9 +76,35 @@ async def run(server: str, slice_percent: tuple[int, int] = (0, 100)) -> CheckRe attachment = record["attachment"] attachment["location"] = base_url + attachment["location"] attachments.append(attachment) - - lower_idx = math.floor(slice_percent[0] / 100.0 * len(attachments)) - upper_idx = math.ceil(slice_percent[1] / 100.0 * len(attachments)) + total_size += attachment["size"] + + # Spread the load of attachments integrity check based on size. + # Otherwise some slices may take much longer to complete than others. + + # We sort by attachment size descending. It's not mandatory, but + # it gives us more observability of what slices are taking longer + # to complete (big files or lots of small files). + attachments.sort(key=lambda att: att["size"], reverse=True) + # Compute the slice using percent of total size. + slice_lower = (slice_percent[0] / 100.0) * total_size + slice_upper = (slice_percent[1] / 100.0) * total_size + lower_idx = 0 + accumulated_size = 0 + for attachment in attachments: + if accumulated_size >= slice_lower: + break + accumulated_size += attachment["size"] + lower_idx += 1 + upper_idx = lower_idx + for attachment in attachments[lower_idx:]: + accumulated_size += attachment["size"] + upper_idx += 1 + if accumulated_size >= slice_upper: + break + print( + f"Total size {total_size} Slice {slice_percent} => size {slice_lower:.0f}-{slice_upper:.0f} bytes" + ) + print(f"Attachments slice indexes: {lower_idx}-{upper_idx} of {len(attachments)}") sliced = attachments[lower_idx:upper_idx] futures = [test_attachment(attachment) for attachment in sliced] diff --git a/tests/checks/remotesettings/test_attachments_integrity.py b/tests/checks/remotesettings/test_attachments_integrity.py index 67ca0e30..5e1cec2e 100644 --- a/tests/checks/remotesettings/test_attachments_integrity.py +++ b/tests/checks/remotesettings/test_attachments_integrity.py @@ -86,7 +86,7 @@ async def test_negative(mock_aioresponses, no_sleep): "location": "file1.jpg", }, }, - {"id": "efg", "attachment": {"location": "missing.jpg"}}, + {"id": "efg", "attachment": {"location": "missing.jpg", "size": 7}}, { "id": "ijk", "attachment": {"size": 10, "hash": "foo", "location": "file2.jpg"}, @@ -113,20 +113,20 @@ async def test_negative(mock_aioresponses, no_sleep): assert data == { "bad": [ { - "error": "size differ (5!=7)", - "url": "http://cdn/file1.jpg", - }, - { - "error": "Connection refused: GET http://cdn/missing.jpg", - "url": "http://cdn/missing.jpg", + "error": "timeout", + "url": "http://cdn/file3.jpg", }, { "error": "hash differ (bf2cb58a68f684d95a3b78ef8f661c9a4e5b09e82cc8f9cc88cce90528caeb27!=foo)", "url": "http://cdn/file2.jpg", }, { - "error": "timeout", - "url": "http://cdn/file3.jpg", + "error": "size differ (5!=7)", + "url": "http://cdn/file1.jpg", + }, + { + "error": "Connection refused: GET http://cdn/missing.jpg", + "url": "http://cdn/missing.jpg", }, ], "checked": 4, @@ -169,7 +169,7 @@ async def test_urls_slicing( records_url, payload={ "changes": [ - {"id": f"id{i}", "attachment": {"location": f"file{i}.jpg"}} + {"id": f"id{i}", "attachment": {"location": f"file{i}.jpg", "size": 10}} for i in range(100) ] }, @@ -183,3 +183,51 @@ async def test_urls_slicing( calls = mocked.call_args_list assert calls[0][0][0]["location"] == f"http://cdn/file{expected_lower}.jpg" assert calls[-1][0][0]["location"] == f"http://cdn/file{expected_upper}.jpg" + + +async def test_urls_slicing_by_size(mock_aioresponses): + server_url = "http://fake.local/v1" + mock_aioresponses.get( + server_url + "/", + payload={"capabilities": {"attachments": {"base_url": "http://cdn/"}}}, + ) + changes_url = server_url + CHANGESET_URL.format("monitor", "changes") + mock_aioresponses.get( + changes_url, + payload={ + "changes": [ + {"id": "abc", "bucket": "bid", "collection": "cid", "last_modified": 42} + ] + }, + ) + records_url = server_url + CHANGESET_URL.format("bid", "cid") + "?_expected=42" + mock_aioresponses.get( + records_url, + payload={ + "changes": [ + {"id": "id-0", "attachment": {"location": "file-big.jpg", "size": 100}}, + { + "id": "id-1", + "attachment": {"location": "file-small1.jpg", "size": 10}, + }, + { + "id": "id-2", + "attachment": {"location": "file-small2.jpg", "size": 10}, + }, + { + "id": "id-3", + "attachment": {"location": "file-small3.jpg", "size": 10}, + }, + ] + }, + ) + + with mock.patch( + "checks.remotesettings.attachments_integrity.test_attachment" + ) as mocked: + mocked.return_value = {}, True + await run(server_url, slice_percent=(0, 50)) + calls = mocked.call_args_list + # Only the big file should be in the first 50% slice. + assert len(calls) == 1 + assert calls[0][0][0]["location"] == "http://cdn/file-big.jpg"