Skip to content

Sorting/paging inconsistencies and errors #514

@floeschau

Description

@floeschau

Hello there,

I found an issue with searches when sorting is used in combination with paging, especially when a field that is a sort criterion is missing from some items. The result is not complete and can even be different depending on the sort order (ascending or descending). Items in the search result can even be duplicated.

The script below reproduces the problem (check the endpoint variable to make sure it uses the correct URL). It creates 10 very simple STAC items of which some have eo:cloud_cover values (with duplicates), but some items don't have that field.

import requests
import datetime
from urllib.parse import urljoin


# Base URL of the STAC API under test; adjust it to match your deployment.
endpoint = "http://app-pgstac"
# ID of the throw-away collection that this script deletes and recreates.
collection_name = "sort-test"


def create_test_collection():
    """(Re)create the test collection on the STAC API at ``endpoint``.

    A DELETE is sent for ``collection_name`` first so that each run starts
    from a fresh collection, then the collection is created with a POST.
    Neither HTTP response is inspected, so a non-success status from either
    request is not reported.
    """
    # Best-effort cleanup; the response is deliberately not checked.
    requests.delete(f"{endpoint}/collections/{collection_name}")

    body = {
        "type": "Collection",
        "id": collection_name,
        "stac_version": "1.0.0",
        "description": "Test collection to demonstrate sorting issues",
        "links": [
            {"rel": "root", "href": None, "type": "application/json"},
        ],
        "stac_extensions": [],
        "extent": {
            "spatial": {"bbox": [-180, -90, 180, 90]},
            "temporal": {"interval": ["2000-01-01T00:00:00Z", None]},
        },
    }
    requests.post(f"{endpoint}/collections", json=body)


def add_test_data():
    """Post ten minimal STAC items into the test collection.

    Items are named ``item-01`` to ``item-10`` and share the same geometry,
    datetime and platform. ``eo:cloud_cover`` is ``(i // 4) * 10`` for the
    zero-based index ``i`` (so the values 0, 10 and 20 repeat across items)
    and is left out entirely whenever ``i`` is a multiple of three, i.e. for
    ``item-01``, ``item-04``, ``item-07`` and ``item-10``.

    The HTTP responses are not inspected.
    """
    items_url = f"{endpoint}/collections/{collection_name}/items"

    for i in range(10):
        item = {
            "stac_version": "1.0.0",
            "type": "Feature",
            "id": f"item-{i + 1:02}",
            "geometry": {
                "type": "Point",
                "coordinates": [12, 41],
            },
            "properties": {
                "datetime": "2022-01-02T03:04:05.678Z",
                "platform": "myplatform",
            },
            "assets": {},
            "links": [],
            "collection": collection_name,
        }
        # Every third item deliberately lacks the sort field.
        if i % 3 != 0:
            item["properties"]["eo:cloud_cover"] = (i // 4) * 10

        requests.post(items_url, json=item)


def search(body: dict):
    """Run a paged ``POST {endpoint}/search`` and collect every returned item.

    The first request is sent with ``body`` (plus ``limit = 2`` when the
    caller did not set a limit). After each page the link with
    ``rel == "next"`` is followed: its ``href`` becomes the next URL and,
    when the link carries a ``body``, that replaces the request body.
    Paging stops when a page has no such link.

    Progress and a final summary are printed to stdout.

    Args:
        body: Search request body. It is copied, so the caller's dict is
            left unmodified.

    Returns:
        The list of all items received across pages, or ``None`` if any
        request returned a status code of 300 or above.
    """
    url = f"{endpoint}/search"
    # Work on a shallow copy so the default limit does not leak into the
    # caller's dict.
    body = dict(body)
    body.setdefault("limit", 2)

    received_items = []
    page = 1

    while True:
        # NOTE: this logs the base endpoint, not the URL actually requested.
        print(f"Page {page}: POST {endpoint}: {body}")
        response = requests.post(url, json=body)

        if response.status_code >= 300:
            print(f"ERROR RESPONSE: {response.text}")
            return None

        result = response.json()
        for item in result["features"]:
            print(f"- ID: {item['id']}")
            received_items.append(item)
        print("----")

        next_link = next(
            (link for link in result["links"] if link.get("rel") == "next"),
            None,
        )
        if not next_link:
            break

        page += 1
        url = next_link["href"]
        if "body" in next_link:
            body = next_link["body"]

    print(f"Summary ({len(received_items)} items):")
    for item in received_items:
        cloud_cover = item["properties"].get("eo:cloud_cover", "--")
        print(f"- ID {item['id']} (eo:cloud_cover = {cloud_cover})")

    return received_items


# Start from a freshly created collection holding the ten test items.
create_test_collection()
add_test_data()

# Case 1: a single sort criterion, once per direction.
for sort_dir in ("asc", "desc"):
    print(f"\n======== Sort direction: {sort_dir} ========\n")
    body = {
        "collections": [collection_name],
        "filter-lang": "cql2-json",
        "sortby": [{"field": "properties.eo:cloud_cover", "direction": sort_dir}],
    }
    search(body=body)

# Case 2: two sort criteria in opposite directions.
print("\n======== Sort direction: asc + desc ========\n")
body = {
    "collections": [collection_name],
    "filter-lang": "cql2-json",
    "sortby": [
        {"field": "properties.eo:cloud_cover", "direction": "asc"},
        {"field": "id", "direction": "desc"},
    ],
}
search(body=body)

When making a search request for the whole collection (without filter) ordering by eo:cloud_cover and with a limit of 2 items per page, all 10 items should be returned (in 5 pages), but there are only 6 items (ascending sort) and 4 items (descending sort), respectively.

Furthermore, when sorting by two criteria in opposite directions, the second page (with the next token) fails with a PostgreSQL syntax error.

This is the output:


======== Sort direction: asc ========

Page 1: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'asc'}], 'limit': 2}
- ID: item-03
- ID: item-02
----
Page 2: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'asc'}], 'limit': 2, 'token': 'next:item-02'}
- ID: item-03
- ID: item-08
----
Page 3: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'asc'}], 'limit': 2, 'token': 'next:item-08'}
- ID: item-09
- ID: item-10
----
Summary (6 items):
- ID item-03 (eo:cloud_cover = 0)
- ID item-02 (eo:cloud_cover = 0)
- ID item-03 (eo:cloud_cover = 0)
- ID item-08 (eo:cloud_cover = 10)
- ID item-09 (eo:cloud_cover = 20)
- ID item-10 (eo:cloud_cover = --)

======== Sort direction: desc ========

Page 1: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'desc'}], 'limit': 2}
- ID: item-10
- ID: item-07
----
Page 2: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'desc'}], 'limit': 2, 'token': 'next:item-07'}
- ID: item-04
- ID: item-01
----
Page 3: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'desc'}], 'limit': 2, 'token': 'next:item-01'}
----
Summary (4 items):
- ID item-10 (eo:cloud_cover = --)
- ID item-07 (eo:cloud_cover = --)
- ID item-04 (eo:cloud_cover = --)
- ID item-01 (eo:cloud_cover = --)

======== Sort direction: asc + desc ========

Page 1: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'asc'}, {'field': 'id', 'direction': 'desc'}], 'limit': 2}
- ID: item-03
- ID: item-02
----
Page 2: POST http://app-pgstac: {'collections': ['sort-test'], 'filter-lang': 'cql2-json', 'sortby': [{'field': 'properties.eo:cloud_cover', 'direction': 'asc'}, {'field': 'id', 'direction': 'desc'}], 'limit': 2, 'token': 'next:item-02'}
ERROR RESPONSE: {"code":"PostgresSyntaxError","description":"syntax error at or near \")\""}

Has anybody else noticed issues like these? They are more likely to be found in large heterogeneous collections that might not be typical, but I think these are problems that should be fixed.

Thank you for any feedback!

Metadata

Metadata

Assignees

Labels

bug (Something isn't working), pgstac

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions