Skip to content

Commit 78b9da6

Browse files
committed
Merge remote-tracking branch 'origin/master' into issue1565-extract-extent-from-constraints-with-caching
2 parents b69a22f + 3c25d14 commit 78b9da6

File tree

5 files changed

+158
-25
lines changed

5 files changed

+158
-25
lines changed

openeogeotrellis/_backend/post_dry_run.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -346,20 +346,50 @@ def _extract_spatial_extent_from_load_stac_item_collection(
346346
for item, band_assets in item_collection.iter_items_with_band_assets()
347347
for asset in band_assets.values()
348348
]
349-
_log.info(f"Collected {len(item_collection.items)} items, {len(projection_metadatas)} projection metadata entries")
349+
350+
# Deduplicate: assets with identical projection properties produce identical
351+
# bbox/coverage/resolution results. E.g. Sentinel-2 items have ~43 assets per item
352+
# that often share the same proj:bbox.
353+
seen_keys = set()
354+
unique_projection_metadatas = []
355+
for p in projection_metadatas:
356+
key = (p._code, p._bbox, p._shape, p._transform)
357+
if key not in seen_keys:
358+
seen_keys.add(key)
359+
unique_projection_metadatas.append(p)
360+
_log.info(
361+
f"Collected {len(projection_metadatas)} projection metadata entries"
362+
f" from {len(item_collection.items)} items,"
363+
f" deduplicated to {len(unique_projection_metadatas)} unique entries"
364+
)
350365

351366
# Determine most common grid (CRS and resolution) among assets
352-
target_grid = _determine_best_grid_from_proj_metadata(projection_metadatas)
367+
target_grid = _determine_best_grid_from_proj_metadata(unique_projection_metadatas)
353368
target_crs = target_grid.crs_raw if target_grid else None
354369
_log.info(f"Determined {target_grid=}")
355370

356-
# Merge asset bounding boxes (full native extent, and "aligned" part of covered extent)
371+
# Merge asset bounding boxes (full native extent, and "aligned" part of covered extent).
372+
# Batch by CRS: compute min/max within each CRS group to avoid
373+
# per-element reprojection overhead in the merger.
357374
assets_full_bbox_merger = BoundingBoxMerger(crs=target_crs)
358-
aligned_extent_coverage_merger = BoundingBoxMerger(crs=target_crs)
359-
for proj_metadata in projection_metadatas:
375+
crs_bbox_groups: Dict[Union[str, None], List[BoundingBox]] = collections.defaultdict(list)
376+
for proj_metadata in unique_projection_metadatas:
360377
if asset_bbox := proj_metadata.to_bounding_box():
361-
assets_full_bbox_merger.add(asset_bbox)
362-
if extent_orig and (extent_coverage := proj_metadata.coverage_for(extent_orig)):
378+
crs_bbox_groups[asset_bbox.crs].append(asset_bbox)
379+
for crs, bboxes in crs_bbox_groups.items():
380+
merged_group = BoundingBox(
381+
west=min(b.west for b in bboxes),
382+
south=min(b.south for b in bboxes),
383+
east=max(b.east for b in bboxes),
384+
north=max(b.north for b in bboxes),
385+
crs=crs,
386+
)
387+
assets_full_bbox_merger.add(merged_group)
388+
389+
aligned_extent_coverage_merger = BoundingBoxMerger(crs=target_crs)
390+
if extent_orig:
391+
for proj_metadata in unique_projection_metadatas:
392+
if extent_coverage := proj_metadata.coverage_for(extent_orig):
363393
aligned_extent_coverage_merger.add(extent_coverage)
364394
assets_full_bbox = assets_full_bbox_merger.get()
365395
assets_covered_bbox = aligned_extent_coverage_merger.get()

openeogeotrellis/integrations/stac.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def read_text_from_href(self, href: str) -> str:
5757

5858
class LoggingStacApiIO(StacApiIO):
5959
"""
60-
StacApiIO subclass that logs every HTTP request at INFO level,
60+
StacApiIO subclass that logs every HTTP request,
6161
including the method, URL, response status code, and elapsed time.
6262
6363
Uses a requests response hook so every request made through the session

openeogeotrellis/load_stac.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import functools
77
import logging
88
import os
9+
import random
910
import re
1011
import time
1112
from dataclasses import dataclass
@@ -64,6 +65,26 @@
6465
STAC_API_PER_PAGE_LIMIT_DEFAULT = 100
6566

6667

68+
class _JitteredRetry(Retry):
    """Retry with jitter to avoid thundering herd on 429 responses.

    - No Retry-After header: full jitter (random in [0, base_backoff])
    - Retry-After header present: respects it as a minimum with full jitter
      (random in [0, base_backoff]) added on top
    """

    def get_backoff_time(self) -> float:
        """Return a backoff drawn uniformly from [0, backoff of the parent class]."""
        base = super().get_backoff_time()
        return random.uniform(0, base)

    def sleep_for_retry(self, response=None) -> bool:
        """Sleep for the Retry-After delay of ``response`` plus random jitter.

        :param response: response to take the Retry-After delay from (optional).
        :return: True if a sleep was done, False if there was no response
            or no Retry-After delay could be obtained from it.
        """
        if response is None:
            # The signature allows calling without a response:
            # there is nothing to extract a Retry-After delay from then.
            return False
        retry_after = self.get_retry_after(response)
        if retry_after is not None:
            # Use the un-jittered backoff of the parent class as upper bound for the jitter
            # (the `get_backoff_time` override of this class would randomize it a second time).
            jitter = random.uniform(0, super().get_backoff_time())
            time.sleep(retry_after + jitter)
            return True
        return False
86+
87+
6788
class NoDataAvailableException(OpenEOApiException):
6889
status_code = 400
6990
code = "NoDataAvailable"
@@ -1236,8 +1257,8 @@ def from_stac_api(
12361257
# https://stac.openeo.vito.be/ and https://stac.terrascope.be
12371258
fields = None
12381259

1239-
retry = requests.adapters.Retry(
1240-
total=4,
1260+
retry = _JitteredRetry(
1261+
total=7,
12411262
backoff_factor=2,
12421263
status_forcelist=frozenset([429, 500, 502, 503, 504]),
12431264
allowed_methods=Retry.DEFAULT_ALLOWED_METHODS.union({"POST"}),

tests/conftest.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import shutil
66
import sys
77
import typing
8+
import urllib.parse
89
from datetime import datetime
910
from glob import glob
1011
from pathlib import Path
@@ -452,6 +453,8 @@ class UrllibAndRequestMocker:
452453
def __init__(self, urllib_mock, requests_mock):
453454
self.urllib_mock = urllib_mock
454455
self.requests_mock = requests_mock
456+
self._flexible_responses: dict[str, dict[tuple, bytes]] = {}
457+
self._flexible_ignore: dict[str, frozenset] = {}
455458

456459
def get(self, href, data):
457460
code = 200
@@ -460,6 +463,71 @@ def get(self, href, data):
460463
data = data.encode("utf-8")
461464
self.requests_mock.get(href, content=data)
462465

466+
@staticmethod
def _normalize_params(url: str, ignore_params: frozenset) -> tuple:
    """Build a hashable, order-independent key from the query string of ``url``.

    Query parameters named in ``ignore_params`` are left out. The remaining ones
    are returned as a sorted tuple of ``(name, sorted tuple of values)`` pairs.
    Parameters with a blank value are kept.
    """
    query = urllib.parse.urlparse(url).query
    parsed = urllib.parse.parse_qs(query, keep_blank_values=True)
    kept = {name: values for name, values in parsed.items() if name not in ignore_params}
    return tuple(sorted((name, tuple(sorted(values))) for name, values in kept.items()))
474+
475+
def get_flexible(self, href: str, data, ignore_params=()):
    """Register a GET mock that matches URLs ignoring specified query parameters.

    Example usage:
        mock.get_flexible(
            "https://stac.test/search?limit=100&bbox=1,2,3,4&collections=foo",
            data=my_response,
            ignore_params=("limit",),
        )

    All registrations for the same base URL path share a single dispatcher
    callback, so call get_flexible() for **all** URLs under a given path
    (don't mix with get() for the same base path).

    :param href: URL to mock. Its query string, minus the ignored parameters,
        is the key the response is stored under.
    :param data: response body, as ``str`` (will be UTF-8 encoded) or as ``bytes``.
    :param ignore_params: names of query parameters to disregard when matching.
        Must be the same for all registrations under the same base URL.
    :raises ValueError: when the base URL was registered before
        with a different set of ``ignore_params``.
    """
    ignore = frozenset(ignore_params)
    parsed = urllib.parse.urlparse(href)
    base_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))

    if base_url not in self._flexible_responses:
        self._flexible_responses[base_url] = {}
        self._flexible_ignore[base_url] = ignore
    elif self._flexible_ignore[base_url] != ignore:
        # Only one ignore set is kept per base URL and that is what requests are
        # normalized with at lookup time. A key built with another ignore set would be
        # stored under a normalization that lookups do not use: fail loudly instead.
        raise ValueError(
            f"Conflicting ignore_params for {base_url!r}:"
            f" {sorted(ignore)} vs earlier {sorted(self._flexible_ignore[base_url])}"
        )

    norm_key = self._normalize_params(href, ignore)
    data_bytes = data.encode("utf-8") if isinstance(data, str) else data

    self._flexible_responses[base_url][norm_key] = data_bytes
    self._install_flexible_handler(base_url)
504+
505+
def _install_flexible_handler(self, base_url: str):
    """Register the GET dispatchers for ``base_url`` on both the urllib and the requests mocker.

    Both dispatchers normalize the query string of the requested URL
    (leaving out the ignored parameters stored for ``base_url``),
    look that key up among the responses stored for ``base_url``
    and answer with a 404 when there is no match.
    """
    known_responses = self._flexible_responses[base_url]
    ignored = self._flexible_ignore[base_url]
    to_key = self._normalize_params

    def find_response(url: str) -> typing.Optional[bytes]:
        return known_responses.get(to_key(url, ignored))

    def handle_urllib(req):
        body = find_response(req.full_url)
        if body is None:
            return UrllibMocker.Response(code=404, msg=f"No flexible match for {req.full_url}")
        return UrllibMocker.Response(data=body)

    def handle_requests(request, context):
        body = find_response(request.url)
        if body is None:
            context.status_code = 404
            return b"Not Found"
        return body

    self.urllib_mock.register(method="GET", url=base_url, response=handle_urllib)
    self.requests_mock.get(base_url, content=handle_requests)
530+
463531

464532
@pytest.fixture
465533
def urllib_and_request_mock(urllib_mock, requests_mock) -> UrllibAndRequestMocker:

tests/test_api_result.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3748,22 +3748,30 @@ def item_json(path):
37483748
"https://stac.test/collections/sentinel-2-l2a",
37493749
data=get_test_data_file("stac/issue830_alternate_url/collections_sentinel-2-l2a.json").read_text(),
37503750
)
3751-
urllib_and_request_mock.get(
3752-
"https://stac.test/search", data=item_json("stac/issue830_alternate_url/search.json")
3751+
# Use get_flexible for search URLs to avoid coupling to the page limit default
3752+
urllib_and_request_mock.get_flexible(
3753+
"https://stac.test/search", data=item_json("stac/issue830_alternate_url/search.json"),
3754+
ignore_params=("limit",),
37533755
)
3754-
urllib_and_request_mock.get(
3755-
"https://stac.test/search?limit=20&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-23T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a&fields=%2Bproperties.proj%3Abbox%2C%2Bproperties.proj%3Aepsg%2C%2Bproperties.proj%3Ashape",
3756-
data=item_json("stac/issue830_alternate_url/search_queried.json"))
3757-
urllib_and_request_mock.get(
3758-
"https://stac.test/search?limit=20&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-16T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a&fields=%2Bproperties.proj%3Abbox%2C%2Bproperties.proj%3Ashape%2C%2Bproperties.proj%3Aepsg&token=MTcxOTEzOTU3OTAyNCxTMkJfTVNJTDJBXzIwMjQwNjIzVDEwNDYxOV9OMDUxMF9SMDUxX1QzMVVGU18yMDI0MDYyM1QxMjIxNTYsc2VudGluZWwtMi1sMmE%3D",
3759-
data=item_json("stac/issue830_alternate_url/search_queried_page2.json"))
3760-
urllib_and_request_mock.get(
3761-
"https://stac.test/search?limit=20&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-23T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a&fields=%2Btype%2C%2Bgeometry%2C%2Bproperties%2C%2Bid%2C%2Bbbox%2C%2Bstac_version%2C%2Bassets%2C%2Blinks%2C%2Bcollection",
3756+
urllib_and_request_mock.get_flexible(
3757+
"https://stac.test/search?limit=100&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-23T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a&fields=%2Bproperties.proj%3Abbox%2C%2Bproperties.proj%3Aepsg%2C%2Bproperties.proj%3Ashape",
37623758
data=item_json("stac/issue830_alternate_url/search_queried.json"),
3759+
ignore_params=("limit",),
37633760
)
3764-
urllib_and_request_mock.get(
3765-
"https://stac.test/search?limit=20&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-23T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a",
3761+
urllib_and_request_mock.get_flexible(
3762+
"https://stac.test/search?limit=100&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-16T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a&fields=%2Bproperties.proj%3Abbox%2C%2Bproperties.proj%3Ashape%2C%2Bproperties.proj%3Aepsg&token=MTcxOTEzOTU3OTAyNCxTMkJfTVNJTDJBXzIwMjQwNjIzVDEwNDYxOV9OMDUxMF9SMDUxX1QzMVVGU18yMDI0MDYyM1QxMjIxNTYsc2VudGluZWwtMi1sMmE%3D",
3763+
data=item_json("stac/issue830_alternate_url/search_queried_page2.json"),
3764+
ignore_params=("limit",),
3765+
)
3766+
urllib_and_request_mock.get_flexible(
3767+
"https://stac.test/search?limit=100&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-23T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a&fields=%2Btype%2C%2Bgeometry%2C%2Bproperties%2C%2Bid%2C%2Bbbox%2C%2Bstac_version%2C%2Bassets%2C%2Blinks%2C%2Bcollection",
3768+
data=item_json("stac/issue830_alternate_url/search_queried.json"),
3769+
ignore_params=("limit",),
3770+
)
3771+
urllib_and_request_mock.get_flexible(
3772+
"https://stac.test/search?limit=100&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2024-06-23T00%3A00%3A00Z%2F2024-06-23T23%3A59%3A59.999000Z&collections=sentinel-2-l2a",
37663773
data=item_json("stac/issue830_alternate_url/search_queried.json"),
3774+
ignore_params=("limit",),
37673775
)
37683776

37693777
process_graph = {
@@ -3842,9 +3850,10 @@ def item_json(path):
38423850
"stac/issue830_alternate_url_s3/catalogue.dataspace.copernicus.eu/stac/index.json"
38433851
).read_text(),
38443852
)
3845-
urllib_and_request_mock.get(
3853+
urllib_and_request_mock.get_flexible(
38463854
"https://catalogue.dataspace.copernicus.eu/stac/search?limit=20&bbox=5.07%2C51.215%2C5.08%2C51.22&datetime=2023-06-01T00%3A00%3A00Z%2F2023-06-30T23%3A59%3A59.999000Z&collections=GLOBAL-MOSAICS",
38473855
data=item_json("stac/issue830_alternate_url_s3/catalogue.dataspace.copernicus.eu/stac/search_queried.json"),
3856+
ignore_params=("limit",),
38483857
)
38493858

38503859
process_graph = {
@@ -4392,7 +4401,11 @@ def feature_collection(request, _) -> dict:
43924401
assert "fields" not in request.qs
43934402
assert request.qs.get("filter-lang") == filter_lang
43944403
assert request.qs.get("filter") == filter
4395-
assert request.body == body or request.json() == body
4404+
if body:
4405+
for k, v in body.items():
4406+
if k == "limit":
4407+
continue
4408+
assert (isinstance(request.body, dict) and request.body.get(k) == v) or request.json().get(k) == v
43964409

43974410
def item(path) -> dict:
43984411
return json.loads(
@@ -4748,9 +4761,10 @@ def item_json(path):
47484761
"stac/issue_copernicus_global_mosaics/stac.dataspace.copernicus.eu/v1/collections/sentinel-2-global-mosaics.json"
47494762
),
47504763
)
4751-
urllib_and_request_mock.get(
4764+
urllib_and_request_mock.get_flexible(
47524765
"https://stac.dataspace.copernicus.eu/v1/search?limit=20&bbox=2.1%2C35.31%2C2.2%2C35.32&datetime=2023-01-01T00%3A00%3A00Z%2F2023-01-01T23%3A59%3A59.999000Z&collections=sentinel-2-global-mosaics",
47534766
data=item_json("stac/issue_copernicus_global_mosaics/stac.dataspace.copernicus.eu/v1/search_queried.json"),
4767+
ignore_params=("limit",),
47544768
)
47554769
urllib_and_request_mock.get(
47564770
"https://stac.dataspace.copernicus.eu/v1/",

0 commit comments

Comments
 (0)