Skip to content

Commit 0a2edea

Browse files
sir-sigurd and Copilot authored
Index 2026+ package pointers (#4683)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 2abb847 commit 0a2edea

File tree

6 files changed

+97
-62
lines changed

6 files changed

+97
-62
lines changed

lambdas/indexer/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ where verb is one of
1717

1818
## Changes
1919

20+
- [Fixed] Process package pointers from year 2026+ ([#4683](https://github.com/quiltdata/quilt/pull/4683))
2021
- [Changed] Switch to uv ([#4616](https://github.com/quiltdata/quilt/pull/4616))
2122
- [Changed] Upgrade to Python 3.12 ([#4616](https://github.com/quiltdata/quilt/pull/4616))
2223
- [Changed] Manifest handling moved to `manifest_indexer` lambda ([#4422](https://github.com/quiltdata/quilt/pull/4422))

lambdas/indexer/src/t4_lambda_es_indexer/index.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@
5151
import os
5252
import pathlib
5353
import re
54+
import time
5455
from os.path import split
5556
from typing import Optional, Tuple
5657
from urllib.parse import unquote_plus
@@ -301,9 +302,11 @@ def index_if_pointer(
301302
return False
302303
try:
303304
manifest_timestamp = int(pointer_file)
304-
if not 1451631600 <= manifest_timestamp <= 1767250800:
305+
if manifest_timestamp < 1451631600:
305306
logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
306307
return False
308+
if manifest_timestamp > time.time():
309+
logger_.warning("Manifest timestamp s3://%s/%s is in the future", bucket, key)
307310
except ValueError as err:
308311
logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)
309312

lambdas/indexer/test/test_index.py

Lines changed: 72 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,13 @@
127127
}
128128

129129

130+
@pytest.fixture
131+
def s3_client(mocker):
132+
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
133+
mocker.patch("t4_lambda_es_indexer.index.make_s3_client", return_value=client)
134+
return client
135+
136+
130137
def _check_event(synthetic, organic):
131138
# Ensure that synthetic events have the same shape as actual organic ones,
132139
# and that overridden properties like bucket, key, eTag are properly set
@@ -913,64 +920,10 @@ def test_index_if_pointer_not_exists(self, append_mock, es_mock):
913920
)
914921
append_mock.assert_not_called()
915922

916-
@patch.object(index, "es")
917-
@patch.object(index.DocumentQueue, 'append_document')
918-
def test_index_if_pointer(self, append_mock, es_mock):
919-
bucket = "quilt-example"
920-
handle = "author/semantic"
921-
pointer_file = "1610412903"
922-
key = f"{NAMED_PACKAGES_PREFIX}{handle}/{pointer_file}"
923-
pkg_hash = "a" * 64
924-
last_modified = datetime.datetime(2021, 1, 1, 0, 0, tzinfo=tzutc())
925-
926-
self.s3_stubber.add_response(
927-
method="get_object",
928-
expected_params={
929-
"Bucket": bucket,
930-
"Key": key,
931-
},
932-
service_response={
933-
"Body": BytesIO(pkg_hash.encode()),
934-
"LastModified": last_modified,
935-
},
936-
)
937-
938-
index.index_if_pointer(
939-
self.s3_client,
940-
index.DocumentQueue(None),
941-
bucket=bucket,
942-
key=key,
943-
)
944-
945-
es_mock.delete_by_query.assert_called_once_with(
946-
index=bucket + PACKAGE_INDEX_SUFFIX,
947-
body={
948-
"query": {
949-
"bool": {
950-
"filter": [{"term": {"_id": get_ptr_doc_id("author/semantic", "1610412903")}}],
951-
"must_not": [{"term": {"join_field#mnfst": get_manifest_doc_id("a" * 64)}}],
952-
}
953-
}
954-
},
955-
)
956-
append_mock.assert_called_once_with({
957-
"_index": bucket + PACKAGE_INDEX_SUFFIX,
958-
"_op_type": "index",
959-
"_id": get_ptr_doc_id(handle, pointer_file),
960-
"join_field": {
961-
"name": "ptr",
962-
"parent": get_manifest_doc_id(pkg_hash),
963-
},
964-
"routing": get_manifest_doc_id(pkg_hash),
965-
"ptr_name": handle,
966-
"ptr_tag": pointer_file,
967-
"ptr_last_modified": last_modified,
968-
})
969-
970923
def test_index_if_pointer_skip(self):
971924
"""test cases where index_if_pointer ignores input for different reasons"""
972925
# none of these should index due to out-of-range timestamp or non-integer name
973-
for file_name in [1451631500, 1767250801]:
926+
for file_name in [1451631500]:
974927
key = f".quilt/named_packages/foo/bar/{file_name}"
975928
assert not index.index_if_pointer(
976929
self.s3_client,
@@ -1695,6 +1648,70 @@ def test_get_object_tagging_no_such_key(self):
16951648
)
16961649

16971650

1651+
@pytest.mark.parametrize(
1652+
"pointer_file",
1653+
[
1654+
"1610412903",
1655+
"9999999999",
1656+
],
1657+
)
1658+
@patch("t4_lambda_es_indexer.index.es")
1659+
@patch("t4_lambda_es_indexer.document_queue.DocumentQueue.append_document")
1660+
def test_index_if_pointer(append_mock, es_mock, s3_client, pointer_file):
1661+
bucket = "quilt-example"
1662+
handle = "author/semantic"
1663+
key = f"{NAMED_PACKAGES_PREFIX}{handle}/{pointer_file}"
1664+
pkg_hash = "a" * 64
1665+
last_modified = datetime.datetime(2021, 1, 1, 0, 0, tzinfo=tzutc())
1666+
1667+
with Stubber(s3_client) as s3_stubber:
1668+
s3_stubber.add_response(
1669+
method="get_object",
1670+
expected_params={
1671+
"Bucket": bucket,
1672+
"Key": key,
1673+
},
1674+
service_response={
1675+
"Body": BytesIO(pkg_hash.encode()),
1676+
"LastModified": last_modified,
1677+
},
1678+
)
1679+
1680+
index.index_if_pointer(
1681+
s3_client,
1682+
index.DocumentQueue(None),
1683+
bucket=bucket,
1684+
key=key,
1685+
)
1686+
1687+
es_mock.delete_by_query.assert_called_once_with(
1688+
index=bucket + PACKAGE_INDEX_SUFFIX,
1689+
body={
1690+
"query": {
1691+
"bool": {
1692+
"filter": [{"term": {"_id": get_ptr_doc_id("author/semantic", pointer_file)}}],
1693+
"must_not": [{"term": {"join_field#mnfst": get_manifest_doc_id("a" * 64)}}],
1694+
}
1695+
}
1696+
},
1697+
)
1698+
append_mock.assert_called_once_with({
1699+
"_index": bucket + PACKAGE_INDEX_SUFFIX,
1700+
"_op_type": "index",
1701+
"_id": get_ptr_doc_id(handle, pointer_file),
1702+
"join_field": {
1703+
"name": "ptr",
1704+
"parent": get_manifest_doc_id(pkg_hash),
1705+
},
1706+
"routing": get_manifest_doc_id(pkg_hash),
1707+
"ptr_name": handle,
1708+
"ptr_tag": pointer_file,
1709+
"ptr_last_modified": last_modified,
1710+
})
1711+
1712+
s3_stubber.assert_no_pending_responses()
1713+
1714+
16981715
def test_extract_pptx():
16991716
lorem = "Lorem ipsum dolor sit amet, consectetur"
17001717

lambdas/pkgevents/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ where verb is one of
1717

1818
## Changes
1919

20+
- [Fixed] Process package pointers from year 2026+ ([#4683](https://github.com/quiltdata/quilt/pull/4683))
2021
- [Changed] Migrate to proper package structure ([#4647](https://github.com/quiltdata/quilt/pull/4647))
2122
- [Changed] Switch to uv ([#4647](https://github.com/quiltdata/quilt/pull/4647))
2223
- [Changed] Upgrade to Python 3.13 ([#4647](https://github.com/quiltdata/quilt/pull/4647))

lambdas/pkgevents/src/t4_lambda_pkgevents/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import itertools
22
import json
33
import re
4+
import time
45

56
import boto3
67

@@ -53,16 +54,19 @@ def pkg_created_event(s3_event):
5354
if not s3_event['eventName'].startswith('ObjectCreated:'):
5455
return
5556
s3_event_obj = s3_event['s3']
57+
bucket = s3_event_obj["bucket"]["name"]
5658
obj = s3_event_obj['object']
5759
key = obj['key']
5860
match = PKG_POINTER_REGEX.fullmatch(key)
5961
if not match:
6062
return
6163
pkg_name, pointer_name = match.groups()
62-
if not '1451631600' <= pointer_name <= '1767250800':
64+
pointer_timestamp = int(pointer_name)
65+
if pointer_timestamp < 1451631600:
66+
logger.warning("pointer %r in bucket %r at %r is too old, skipping", pointer_name, bucket, key)
6367
return
64-
bucket_obj = s3_event_obj['bucket']
65-
bucket = bucket_obj['name']
68+
if pointer_timestamp > time.time():
69+
logger.warning("pointer %r in bucket %r at %r is in the future", pointer_name, bucket, key)
6670
try:
6771
resp = s3.get_object(Bucket=bucket, Key=key, Range=f'bytes=0-{EXPECTED_POINTER_SIZE - 1}')
6872
except s3.exceptions.NoSuchKey:

lambdas/pkgevents/tests/test_index.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,6 @@
3131
'.quilt/named_packages/a/b/1451631599',
3232
'.quilt/named_packages/a/b/1451631600/',
3333
'.quilt/named_packages/a/b/1767250800/',
34-
'.quilt/named_packages/a/b/1767250801',
3534
'.quilt/named_packages//b/1451631600',
3635
'.quilt/named_packages/a//1451631600',
3736
'.quilt/named_packages/a/b/145163160߀',
@@ -46,15 +45,25 @@ def test_pkg_created_event_bad_key(key):
4645
'object': {
4746
'key': key,
4847
},
48+
"bucket": {
49+
"name": "test-bucket",
50+
},
4951
},
5052
}
5153
) is None
5254

5355

54-
def test_pkg_created_event():
56+
@pytest.mark.parametrize(
57+
"pointer_file",
58+
(
59+
"1451631600",
60+
"9999999999",
61+
),
62+
)
63+
def test_pkg_created_event(pointer_file):
5564
bucket_name = 'test-bucket'
5665
handle = 'a/b'
57-
key = f'.quilt/named_packages/{handle}/1451631600'
66+
key = f".quilt/named_packages/{handle}/{pointer_file}"
5867
event_time = '2021-03-11T14:29:19.277067Z'
5968
top_hash = b'a' * 64
6069
event = {

0 commit comments

Comments
 (0)