Skip to content

Commit 41f094f

Browse files
authored
Merge pull request #122 from stac-utils/pv/clarify-conformance-classes-run
Validate Item Search pagination
2 parents def6e7a + 2562dc3 commit 41f094f

File tree

6 files changed

+251
-23
lines changed

6 files changed

+251
-23
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
/docs/_build/
99
/src/*.egg-info/
1010
__pycache__/
11+
.ipynb_checkpoints/

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.2.0] - TBD
9+
10+
### Added
11+
12+
- Validate Item Search and Features pagination
13+
14+
### Changed
15+
16+
- More explicit reporting of which conformance class validations are run and which are skipped.
17+
818
## [0.1.1] - 2022-10-11
919

1020
Release is primarily to publish to Read the Docs as a version.

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ and then run it:
4747
$ stac-api-validator \
4848
--root-url https://planetarycomputer.microsoft.com/api/stac/v1/ \
4949
--conformance core \
50+
--conformance features \
5051
--conformance item-search \
5152
--collection sentinel-2-l2a \
5253
--geometry '{"type": "Polygon", "coordinates": [[[100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0]]]}'
@@ -67,7 +68,8 @@ class will always be validated, even if not specified.
6768

6869
If `item-search`, `collections`, and/or `features` are specified, the `--collection` and `--geometry` parameters must also
6970
be specified. The `--collection` parameter specifies the name of a collection to use for some of the validations.
70-
The `--geometry` should specify an AOI over which there are some results in that collection.
71+
The `--geometry` parameter should specify an AOI over which there are between 100 and 20,000 results for the collection (more
72+
results means longer time to run).
7173

7274
## Features
7375

poetry.lock

Lines changed: 13 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@ Changelog = "https://github.com/stac-utils/stac-api-validator/releases"
1717

1818
[tool.poetry.dependencies]
1919
python = "^3.10"
20-
click = ">=8.0.1"
21-
pystac-client = "^0.5.0"
20+
click = "^8.1.3"
21+
pystac-client = "^0.5.1"
2222
requests = "^2.28.1"
2323
pystac = {extras = ["orjson"], version = "^1.6.1"}
2424
jsonschema = "^4.16.0"
2525
PyYAML = "6.0"
2626
Shapely = "1.8.4"
27+
more_itertools = "^8.14.0"
2728

2829
[tool.poetry.dev-dependencies]
2930
Pygments = ">=2.10.0"

src/stac_api_validator/validations.py

Lines changed: 221 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import requests
1313
import yaml
14+
from more_itertools import take
1415
from pystac import STACValidationError
1516
from pystac_client import Client
1617
from shapely.geometry import shape
@@ -286,7 +287,12 @@ def validate_api(
286287
print("Validating STAC API - Features conformance class.")
287288
validate_collections(root_body, collection, warnings, errors) # type:ignore
288289
validate_features(
289-
root_body, conforms_to, collection, warnings, errors # type:ignore
290+
root_body,
291+
conforms_to,
292+
collection,
293+
geometry,
294+
warnings,
295+
errors,
290296
)
291297
else:
292298
print("Skipping STAC API - Features conformance class.")
@@ -320,8 +326,8 @@ def validate_api(
320326

321327

322328
def link_by_rel(
323-
links: Optional[List[Dict[str, str]]], rel: str
324-
) -> Optional[Dict[str, str]]:
329+
links: Optional[List[Dict[str, Any]]], rel: str
330+
) -> Optional[Dict[str, Any]]:
325331
if not links:
326332
return None
327333
else:
@@ -455,12 +461,21 @@ def validate_collections(
455461
def validate_features(
456462
root_body: Dict[str, Any],
457463
conforms_to: List[str],
458-
collection: str,
464+
collection: Optional[str],
465+
geometry: Optional[str],
459466
warnings: List[str],
460467
errors: List[str],
461468
) -> None:
462469
print("WARNING: Features validation is not yet fully implemented.")
463470

471+
if not geometry:
472+
errors.append("Geometry parameter required for running Features validations.")
473+
return
474+
475+
if not collection:
476+
errors.append("Collection parameter required for running Features validations.")
477+
return
478+
464479
if conforms_to and (
465480
req_ccs := [
466481
x
@@ -524,24 +539,42 @@ def validate_features(
524539
f"service-desc ({conformance}): must return JSON, instead got non-JSON text"
525540
)
526541

527-
# this is hard to figure out, since it's likely a mistake, but most apis can't undo it for
528-
# backwards-compat reasons
529-
if not (link_by_rel(root_links, "collections") is None):
530-
warnings.append(
531-
"/ Link[rel=collections] is a non-standard relation. Use Link[rel=data instead]"
532-
)
542+
# this is hard to figure out, since it's likely a mistake, but most apis can't undo it for
543+
# backwards-compat reasons
544+
if not (link_by_rel(root_links, "collections") is None):
545+
warnings.append(
546+
"/ Link[rel=collections] is a non-standard relation. Use Link[rel=data instead]"
547+
)
533548

534-
# todo: validate items exists
549+
# todo: validate items exists in the collection
535550

536-
if not (collections_url := link_by_rel(root_links, "data")):
537-
errors.append("/: Link[rel=data] must href /collections")
551+
if not (collections_url := link_by_rel(root_links, "data")):
552+
errors.append("/: Link[rel=data] must href /collections")
553+
else:
554+
item_url = f"{collections_url['href']}/{collection}/items/non-existent-item"
555+
r = requests.get(item_url)
556+
if r.status_code != 404:
557+
errors.append(
558+
f"[Features] GET {item_url} (non-existent item) returned status code {r.status_code} instead of 404"
559+
)
560+
561+
if not (collections_url := link_by_rel(root_links, "data")):
562+
errors.append(
563+
"/: Link[rel=data] must href /collections, cannot run pagination test"
564+
)
565+
else:
566+
if not (self_link := link_by_rel(root_links, "self")):
567+
errors.append("/: Link[rel=self] missing")
538568
else:
539-
item_url = f"{collections_url['href']}/{collection}/items/non-existent-item"
540-
r = requests.get(item_url)
541-
if r.status_code != 404:
542-
errors.append(
543-
f"[Features] GET {item_url} (non-existent item) returned status code {r.status_code} instead of 404"
544-
)
569+
validate_item_pagination(
570+
root_url=self_link.get("href", ""),
571+
search_url=f"{collections_url['href']}/{collection}/items",
572+
collection=None,
573+
geometry=geometry,
574+
post=False,
575+
errors=errors,
576+
use_pystac_client=False,
577+
)
545578

546579
# if any(cc_features_fields_regex.fullmatch(x) for x in conforms_to):
547580
# print("STAC API - Features - Fields extension conformance class found.")
@@ -631,6 +664,15 @@ def validate_item_search(
631664
geometry=geometry,
632665
)
633666

667+
validate_item_pagination(
668+
root_url=root_url,
669+
search_url=search_url,
670+
collection=collection,
671+
geometry=geometry,
672+
post=post,
673+
errors=errors,
674+
)
675+
634676
# if any(cc_item_search_fields_regex.fullmatch(x) for x in conforms_to):
635677
# print("STAC API - Item Search - Fields extension conformance class found.")
636678
#
@@ -1034,6 +1076,166 @@ def validate_item_search_bbox_xor_intersects(
10341076
)
10351077

10361078

1079+
def validate_item_pagination(
    root_url: str,
    search_url: str,
    collection: Optional[str],
    geometry: str,
    post: bool,
    errors: List[str],
    use_pystac_client: bool = True,
) -> None:
    """Validate that paging through item results works for GET and, optionally, POST.

    Two kinds of checks are made for each HTTP method:

    1. A raw ``limit=1`` request is issued, its ``next`` link is inspected
       (method, href) and then followed once.
    2. pystac-client pages through up to 100 items and the result is checked
       for over-long or duplicated pages.

    Nothing is returned or raised for validation failures; every problem found
    is appended to ``errors``.

    Args:
        root_url: Landing page URL, used to open the pystac-client ``Client``.
        search_url: URL of the endpoint that is paged (search or items).
        collection: Collection id to restrict the requests to, or ``None`` to
            send no ``collections`` filter at all.
        geometry: Value passed as ``intersects`` to the full POST paging run.
        post: Whether the POST checks should be run in addition to GET.
        errors: List that validation error messages are appended to.
        use_pystac_client: Whether to run the pystac-client based GET paging
            check. The POST checks always use pystac-client.
    """
    max_items = 100
    # Send no ``collections`` filter instead of ``[None]`` when no collection
    # was supplied.
    collections = [collection] if collection is not None else None

    # --- GET: follow the first 'next' link by hand -------------------------
    url = f"{search_url}?limit=1"
    if collection is not None:
        url = f"{url}&collections={collection}"

    r = requests.get(url)
    if r.status_code != 200:
        errors.append(
            "STAC API - Item Search GET pagination get failed for initial request"
        )
    else:
        try:
            first_body = r.json()
            if link := link_by_rel(first_body.get("links"), "next"):
                # A missing 'method' is accepted; only an explicit non-GET is wrong.
                if (method := link.get("method")) and method != "GET":
                    errors.append(
                        f"STAC API - Item Search GET pagination first request 'next' link relation has method {method} instead of 'GET'"
                    )

                next_url = link.get("href")
                if next_url is None:
                    errors.append(
                        "STAC API - Item Search GET pagination first request 'next' link relation missing href"
                    )
                else:
                    if url == next_url:
                        errors.append(
                            "STAC API - Item Search GET pagination next href same as first url"
                        )

                    r = requests.get(next_url)
                    if r.status_code != 200:
                        errors.append(
                            f"STAC API - Item Search GET pagination get failed for next url {next_url}"
                        )
            else:
                errors.append(
                    "STAC API - Item Search GET pagination first request had no 'next' link relation"
                )
        except json.decoder.JSONDecodeError:
            errors.append(
                f"STAC API - Item Search GET pagination response failed {url}"
            )

    # todo: how to paginate over items, not just search?

    # --- GET: page with pystac-client --------------------------------------
    if use_pystac_client:
        client = Client.open(root_url)
        search = client.search(
            method="GET", collections=collections, max_items=max_items, limit=5
        )
        _check_paged_items(list(search.items_as_dicts()), max_items, "GET", errors)

        # NOTE: the full "page until exhausted" check with ``intersects`` is
        # only run for POST; GET paging with intersects is affected by
        # https://github.com/stac-utils/pystac-client/issues/335

    if not post:
        return

    # --- POST: follow the first 'next' link by hand ------------------------
    initial_json_body: Dict[str, Any] = {"limit": 1}
    if collections is not None:
        initial_json_body["collections"] = collections

    r = requests.post(search_url, json=initial_json_body)
    if r.status_code != 200:
        errors.append(
            "STAC API - Item Search POST pagination get failed for initial request"
        )
    else:
        try:
            first_body = r.json()
            if link := link_by_rel(first_body.get("links"), "next"):
                if (method := link.get("method")) and method != "POST":
                    errors.append(
                        f"STAC API - Item Search POST pagination first request 'next' link relation has method {method} instead of 'POST'"
                    )

                next_url = link.get("href")
                if next_url is None:
                    errors.append(
                        "STAC API - Item Search POST pagination first request 'next' link relation missing href"
                    )
                else:
                    next_body: Dict[str, Any] = link.get("body", {})
                    if link.get("merge", False):
                        # merge=true: the link body is overlaid on the original
                        # request body. Build a new dict so the initial body
                        # stays intact for the comparison below.
                        second_json_body = {**initial_json_body, **(next_body or {})}
                    else:
                        second_json_body = next_body

                    # For POST the href alone legitimately stays the same from
                    # page to page; the request only repeats itself when both
                    # href and effective body are unchanged.
                    if (
                        next_url == search_url
                        and second_json_body == initial_json_body
                    ):
                        errors.append(
                            "STAC API - Item Search POST pagination next link repeats the initial request (same href and body)"
                        )

                    r = requests.post(next_url, json=second_json_body)
                    if r.status_code != 200:
                        errors.append(
                            f"STAC API - Item Search POST pagination get failed for next url {next_url} with body {second_json_body}"
                        )
                    else:
                        # Only checks that the second page parses as JSON.
                        r.json()
            else:
                errors.append(
                    "STAC API - Item Search POST pagination first request had no 'next' link relation"
                )
        except json.decoder.JSONDecodeError:
            errors.append("STAC API - Item Search POST pagination response failed")

    # --- POST: page with pystac-client -------------------------------------
    client = Client.open(root_url)
    search = client.search(
        method="POST", collections=collections, max_items=max_items, limit=5
    )
    _check_paged_items(list(search.items_as_dicts()), max_items, "POST", errors)

    # Page through everything matching the geometry, stopping at a hard cap so
    # a self-referencing last page cannot loop forever.
    search = client.search(
        method="POST", collections=collections, intersects=geometry
    )
    if len(list(take(20000, search.items_as_dicts()))) == 20000:
        errors.append(
            "STAC API - Item Search POST pagination - paged through 20,000 results. This could mean the last page "
            "of results references itself, or your collection and geometry combination has too many results."
        )


def _check_paged_items(
    items: List[Dict[str, Any]], max_items: int, method: str, errors: List[str]
) -> None:
    """Append errors if paged ``items`` exceed ``max_items`` or repeat an id."""
    if len(items) > max_items:
        errors.append(
            f"STAC API - Item Search {method} pagination - more than max items returned from paginating"
        )

    if len(items) > len({item["id"] for item in items}):
        errors.append(
            f"STAC API - Item Search {method} pagination - duplicate items returned from paginating items"
        )
1238+
10371239
def validate_item_search_intersects(
10381240
search_url: str, collection: str, post: bool, errors: List[str], geometry: str
10391241
) -> None:

0 commit comments

Comments
 (0)