diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index abf6ebfa..25865744 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -28,6 +28,7 @@ jobs: xpack.security.enabled: false xpack.security.transport.ssl.enabled: false ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false ports: - 9200:9200 @@ -44,6 +45,7 @@ jobs: xpack.security.enabled: false xpack.security.transport.ssl.enabled: false ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false ports: - 9400:9400 @@ -60,6 +62,7 @@ jobs: plugins.security.disabled: true plugins.security.ssl.http.enabled: true OPENSEARCH_JAVA_OPTS: -Xms512m -Xmx512m + action.destructive_requires_name: false ports: - 9202:9202 @@ -120,5 +123,6 @@ jobs: ES_PORT: ${{ matrix.backend == 'elasticsearch7' && '9400' || matrix.backend == 'elasticsearch8' && '9200' || '9202' }} ES_HOST: 172.17.0.1 ES_USE_SSL: false + DATABASE_REFRESH: true ES_VERIFY_CERTS: false BACKEND: ${{ matrix.backend == 'elasticsearch7' && 'elasticsearch' || matrix.backend == 'elasticsearch8' && 'elasticsearch' || 'opensearch' }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ed76fcc..52ce5f2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,32 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +- Added comprehensive index management system with dynamic selection and insertion strategies for improved performance and scalability [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405) +- Added `ENABLE_DATETIME_INDEX_FILTERING` environment variable to enable datetime-based index selection using collection IDs. When enabled, the system creates indexes with UUID-based names and manages them through time-based aliases. Default is `false`. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405) +- Added `DATETIME_INDEX_MAX_SIZE_GB` environment variable to set maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Note: add +20% to target size due to ES/OS compression. Default is `25` GB. Only applies when `ENABLE_DATETIME_INDEX_FILTERING` is enabled. 
[#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405) +- Added index operations system with unified interface for both Elasticsearch and OpenSearch [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - `IndexOperations` class with common index creation and management methods + - UUID-based physical index naming: `{prefix}_{collection-id}_{uuid4}` + - Alias management: main collection alias, temporal aliases, and closed index aliases + - Automatic alias updates when indexes reach size limits +- Added datetime-based index selection strategies with caching support [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - `DatetimeBasedIndexSelector` for temporal filtering with intelligent caching + - `IndexCacheManager` with configurable TTL-based cache expiration (default 1 hour) + - `IndexAliasLoader` for alias management and cache refresh + - `UnfilteredIndexSelector` as fallback for returning all available indexes +- Added index insertion strategies with automatic partitioning [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - Simple insertion strategy (`SimpleIndexInserter`) for traditional single-index-per-collection approach + - Datetime-based insertion strategy (`DatetimeIndexInserter`) with time-based partitioning + - Automatic index size monitoring and splitting when limits exceeded + - Handling of chronologically early data and bulk operations +- Added index management utilities [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - `IndexSizeManager` for size monitoring and overflow handling with compression awareness + - `DatetimeIndexManager` for datetime-based index operations and validation + - Factory patterns (`IndexInsertionFactory`, `IndexSelectorFactory`) for strategy creation based on configuration + + ## [v6.1.0] - 2025-07-24 ### Added diff --git a/Makefile b/Makefile index c23ca951..204b31a1 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ run_os = docker compose \ .PHONY: image-deploy-es image-deploy-es: docker build -f dockerfiles/Dockerfile.dev.es -t stac-fastapi-elasticsearch:latest . - + .PHONY: image-deploy-os image-deploy-os: docker build -f dockerfiles/Dockerfile.dev.os -t stac-fastapi-opensearch:latest . 
@@ -71,14 +71,19 @@ test-opensearch:
	-$(run_os) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest'
	docker compose down

-.PHONY: test
-test:
-	-$(run_es) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh elasticsearch:9200 && cd stac_fastapi/tests/ && pytest --cov=stac_fastapi --cov-report=term-missing'
+.PHONY: test-datetime-filtering-es
+test-datetime-filtering-es:
+	-$(run_es) /bin/bash -c 'export ENABLE_DATETIME_INDEX_FILTERING=true && ./scripts/wait-for-it-es.sh elasticsearch:9200 && cd stac_fastapi/tests/ && pytest -s --cov=stac_fastapi --cov-report=term-missing -m datetime_filtering'
 	docker compose down
-	-$(run_os) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest --cov=stac_fastapi --cov-report=term-missing'
+.PHONY: test-datetime-filtering-os
+test-datetime-filtering-os:
+	-$(run_os) /bin/bash -c 'export ENABLE_DATETIME_INDEX_FILTERING=true && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest -s --cov=stac_fastapi --cov-report=term-missing -m datetime_filtering'
 	docker compose down

+.PHONY: test
+test: test-elasticsearch test-datetime-filtering-es test-opensearch test-datetime-filtering-os
+
 .PHONY: run-database-es
 run-database-es:
 	docker compose run --rm elasticsearch
diff --git a/README.md b/README.md
index 9e5a4674..977a351a 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,81 @@ You can customize additional settings in your `.env` file:
 > [!NOTE]
 > The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, `ES_VERIFY_CERTS` and `ES_TIMEOUT` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch.
 
+# Datetime-Based Index Management
+
+## Overview
+
+SFEOS supports two indexing strategies for managing STAC items:
+
+1. **Simple Indexing** (default) - One index per collection
+2. **Datetime-Based Indexing** - Time-partitioned indexes with automatic management
+
+The datetime-based indexing strategy is particularly useful for large temporal datasets. When a query includes a datetime parameter, the system searches only the indexes covering that time range, making datetime-filtered searches **several times faster** and significantly **reducing database load**; see the example request below.
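+
+The request below is a minimal sketch of a datetime-filtered search against a local SFEOS instance (the host, port, and collection id are illustrative):
+
+```bash
+# With datetime-based indexing enabled, this query only touches the
+# time-partitioned indexes that overlap March 2024.
+curl -s "http://localhost:8080/search" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "collections": ["sentinel-2-l2a"],
+    "datetime": "2024-03-01T00:00:00Z/2024-03-31T23:59:59Z",
+    "limit": 10
+  }'
+```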
+
+## When to Use
+
+**Recommended for:**
+- Systems with large collections containing millions of items
+- Systems requiring high-performance temporal searching
+
+**Pros:**
+- Several times faster queries when filtering by datetime
+- Reduced database load - only the relevant indexes are searched
+
+**Cons:**
+- Slightly longer item indexing time (due to automatic index management)
+- Greater management complexity
+
+## Configuration
+
+### Enabling Datetime-Based Indexing
+
+Enable datetime-based indexing by setting the following environment variable:
+
+```bash
+ENABLE_DATETIME_INDEX_FILTERING=true
+```
+
+### Related Configuration Variables
+
+| Variable | Description | Default | Example |
+|----------|-------------|---------|---------|
+| `ENABLE_DATETIME_INDEX_FILTERING` | Enables time-based index partitioning | `false` | `true` |
+| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit for datetime indexes (GB); allow roughly +20% over the target size due to ES/OS compression | `25` | `50` |
+| `STAC_ITEMS_INDEX_PREFIX` | Prefix for item indexes | `items_` | `stac_items_` |
+
+## How Datetime-Based Indexing Works
+
+### Index and Alias Naming Convention
+
+The system uses a precise naming convention:
+
+**Physical indexes:**
+```
+{ITEMS_INDEX_PREFIX}{collection-id}_{uuid4}
+```
+
+**Aliases:**
+```
+{ITEMS_INDEX_PREFIX}{collection-id}  # Main collection alias
+{ITEMS_INDEX_PREFIX}{collection-id}_{start-datetime}  # Temporal alias
+{ITEMS_INDEX_PREFIX}{collection-id}_{start-datetime}_{end-datetime}  # Closed index alias
+```
+
+**Example:**
+
+*Physical indexes:*
+- `items_sentinel-2-l2a_a1b2c3d4-e5f6-7890-abcd-ef1234567890`
+
+*Aliases:*
+- `items_sentinel-2-l2a` - main collection alias
+- `items_sentinel-2-l2a_2024-01-01` - active alias from January 1, 2024
+- `items_sentinel-2-l2a_2024-01-01_2024-03-15` - closed index alias (reached size limit)
+
+### Index Size Management
+
+**Important - Data Compression:** Elasticsearch and OpenSearch automatically compress data. The configured `DATETIME_INDEX_MAX_SIZE_GB` limit refers to the compressed size on disk. It is recommended to add +20% to the target size to account for compression overhead and metadata.
+
 ## Interacting with the API

 - **Creating a Collection**:
@@ -538,4 +613,3 @@ You can customize additional settings in your `.env` file:
   - Ensures fair resource allocation among all clients
 - **Examples**: Implementation examples are available in the [examples/rate_limit](examples/rate_limit) directory.
- diff --git a/compose.yml b/compose.yml index 05665595..ba898bb1 100644 --- a/compose.yml +++ b/compose.yml @@ -21,6 +21,7 @@ services: - ES_USE_SSL=false - ES_VERIFY_CERTS=false - BACKEND=elasticsearch + - DATABASE_REFRESH=true ports: - "8080:8080" volumes: @@ -72,6 +73,7 @@ services: hostname: elasticsearch environment: ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false volumes: - ./elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml - ./elasticsearch/snapshots:/usr/share/elasticsearch/snapshots @@ -86,6 +88,7 @@ services: - discovery.type=single-node - plugins.security.disabled=true - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m + - action.destructive_requires_name=false volumes: - ./opensearch/config/opensearch.yml:/usr/share/opensearch/config/opensearch.yml - ./opensearch/snapshots:/usr/share/opensearch/snapshots diff --git a/stac_fastapi/core/stac_fastapi/core/core.py b/stac_fastapi/core/stac_fastapi/core/core.py index 8d1f472b..07b17890 100644 --- a/stac_fastapi/core/stac_fastapi/core/core.py +++ b/stac_fastapi/core/stac_fastapi/core/core.py @@ -324,10 +324,15 @@ async def item_collection( search=search, collection_ids=[collection_id] ) - if datetime: - search = self.database.apply_datetime_filter( - search=search, interval=datetime + try: + search, datetime_search = self.database.apply_datetime_filter( + search=search, datetime=datetime ) + except (ValueError, TypeError) as e: + # Handle invalid interval formats if return_date fails + msg = f"Invalid interval format: {datetime}, error: {e}" + logger.error(msg) + raise HTTPException(status_code=400, detail=msg) if bbox: bbox = [float(x) for x in bbox] @@ -342,6 +347,7 @@ async def item_collection( sort=None, token=token, collection_ids=[collection_id], + datetime_search=datetime_search, ) items = [ @@ -500,10 +506,15 @@ async def post_search( search=search, collection_ids=search_request.collections ) - if search_request.datetime: - search = self.database.apply_datetime_filter( - search=search, interval=search_request.datetime + try: + search, datetime_search = self.database.apply_datetime_filter( + search=search, datetime=search_request.datetime ) + except (ValueError, TypeError) as e: + # Handle invalid interval formats if return_date fails + msg = f"Invalid interval format: {search_request.datetime}, error: {e}" + logger.error(msg) + raise HTTPException(status_code=400, detail=msg) if search_request.bbox: bbox = search_request.bbox @@ -560,6 +571,7 @@ async def post_search( token=search_request.token, sort=sort, collection_ids=search_request.collections, + datetime_search=datetime_search, ) fields = ( diff --git a/stac_fastapi/core/stac_fastapi/core/datetime_utils.py b/stac_fastapi/core/stac_fastapi/core/datetime_utils.py index f9dbacf5..87911ac5 100644 --- a/stac_fastapi/core/stac_fastapi/core/datetime_utils.py +++ b/stac_fastapi/core/stac_fastapi/core/datetime_utils.py @@ -1,4 +1,5 @@ """Utility functions to handle datetime parsing.""" + from datetime import datetime, timezone from stac_fastapi.types.rfc3339 import rfc3339_str_to_datetime diff --git a/stac_fastapi/core/stac_fastapi/core/serializers.py b/stac_fastapi/core/stac_fastapi/core/serializers.py index 9b0d36d4..d537b493 100644 --- a/stac_fastapi/core/stac_fastapi/core/serializers.py +++ b/stac_fastapi/core/stac_fastapi/core/serializers.py @@ -1,4 +1,5 @@ """Serializers.""" + import abc from copy import deepcopy from typing import Any, List, Optional diff --git a/stac_fastapi/core/stac_fastapi/core/session.py 
b/stac_fastapi/core/stac_fastapi/core/session.py index d5a7aa3c..990f9d73 100644 --- a/stac_fastapi/core/stac_fastapi/core/session.py +++ b/stac_fastapi/core/stac_fastapi/core/session.py @@ -1,4 +1,5 @@ """database session management.""" + import logging import attr diff --git a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py index 16a8a83d..5f100980 100644 --- a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py +++ b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py @@ -4,7 +4,7 @@ import logging from base64 import urlsafe_b64decode, urlsafe_b64encode from copy import deepcopy -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type import attr import elasticsearch.helpers as helpers @@ -27,7 +27,7 @@ PartialItem, PatchOperation, ) -from stac_fastapi.sfeos_helpers import filter +from stac_fastapi.sfeos_helpers import filter as filter_module from stac_fastapi.sfeos_helpers.database import ( apply_free_text_filter_shared, apply_intersects_filter_shared, @@ -36,7 +36,6 @@ get_queryables_mapping_shared, index_alias_by_collection_id, index_by_collection_id, - indices, mk_actions, mk_item_id, populate_sort_shared, @@ -59,9 +58,14 @@ ITEMS_INDEX_PREFIX, Geometry, ) +from stac_fastapi.sfeos_helpers.search_engine import ( + BaseIndexInserter, + BaseIndexSelector, + IndexInsertionFactory, + IndexSelectorFactory, +) from stac_fastapi.types.errors import ConflictError, NotFoundError from stac_fastapi.types.links import resolve_links -from stac_fastapi.types.rfc3339 import DateTimeType from stac_fastapi.types.stac import Collection, Item logger = logging.getLogger(__name__) @@ -139,6 +143,8 @@ class DatabaseLogic(BaseDatabaseLogic): sync_settings: SyncElasticsearchSettings = attr.ib( factory=SyncElasticsearchSettings ) + async_index_selector: BaseIndexSelector = attr.ib(init=False) + async_index_inserter: BaseIndexInserter = attr.ib(init=False) client = attr.ib(init=False) sync_client = attr.ib(init=False) @@ -147,6 +153,10 @@ def __attrs_post_init__(self): """Initialize clients after the class is instantiated.""" self.client = self.async_settings.create_client self.sync_client = self.sync_settings.create_client + self.async_index_inserter = IndexInsertionFactory.create_insertion_strategy( + self.client + ) + self.async_index_selector = IndexSelectorFactory.create_selector(self.client) item_serializer: Type[ItemSerializer] = attr.ib(default=ItemSerializer) collection_serializer: Type[CollectionSerializer] = attr.ib( @@ -216,15 +226,23 @@ async def get_one_item(self, collection_id: str, item_id: str) -> Dict: with the index for the Collection as the target index and the combined `mk_item_id` as the document id. 
""" try: - item = await self.client.get( + response = await self.client.search( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, ) + if response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + + return response["hits"]["hits"][0]["_source"] except ESNotFoundError: raise NotFoundError( f"Item {item_id} does not exist inside Collection {collection_id}" ) - return item["_source"] async def get_queryables_mapping(self, collection_id: str = "*") -> dict: """Retrieve mapping of Queryables for search. @@ -260,31 +278,21 @@ def apply_collections_filter(search: Search, collection_ids: List[str]): @staticmethod def apply_datetime_filter( - search: Search, interval: Optional[Union[DateTimeType, str]] - ) -> Search: + search: Search, datetime: Optional[str] + ) -> Tuple[Search, Dict[str, Optional[str]]]: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - interval: Optional datetime interval to filter by. Can be: - - A single datetime string (e.g., "2023-01-01T12:00:00") - - A datetime range string (e.g., "2023-01-01/2023-12-31") - - A datetime object - - A tuple of (start_datetime, end_datetime) + datetime: Optional[str] Returns: The filtered search object. """ - if not interval: - return search + datetime_search = return_date(datetime) - should = [] - try: - datetime_search = return_date(interval) - except (ValueError, TypeError) as e: - # Handle invalid interval formats if return_date fails - logger.error(f"Invalid interval format: {interval}, error: {e}") - return search + if not datetime_search: + return search, datetime_search if "eq" in datetime_search: # For exact matches, include: @@ -351,7 +359,10 @@ def apply_datetime_filter( ), ] - return search.query(Q("bool", should=should, minimum_should_match=1)) + return ( + search.query(Q("bool", should=should, minimum_should_match=1)), + datetime_search, + ) @staticmethod def apply_bbox_filter(search: Search, bbox: List): @@ -466,7 +477,7 @@ async def apply_cql2_filter( otherwise the original Search object. """ if _filter is not None: - es_query = filter.to_es(await self.get_queryables_mapping(), _filter) + es_query = filter_module.to_es(await self.get_queryables_mapping(), _filter) search = search.query(es_query) return search @@ -493,6 +504,7 @@ async def execute_search( token: Optional[str], sort: Optional[Dict[str, Dict[str, str]]], collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], ignore_unavailable: bool = True, ) -> Tuple[Iterable[Dict[str, Any]], Optional[int], Optional[str]]: """Execute a search query with limit and other optional parameters. @@ -503,6 +515,7 @@ async def execute_search( token (Optional[str]): The token used to return the next set of results. sort (Optional[Dict[str, Dict[str, str]]]): Specifies how the results should be sorted. collection_ids (Optional[List[str]]): The collection ids to search. + datetime_search (Dict[str, Optional[str]]): Datetime range used for index selection. ignore_unavailable (bool, optional): Whether to ignore unavailable collections. Defaults to True. 
Returns: @@ -523,7 +536,9 @@ async def execute_search( query = search.query.to_dict() if search.query else None - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) if len(index_param) > ES_MAX_URL_LENGTH - 300: index_param = ITEM_INDICES query = add_collections_to_body(collection_ids, query) @@ -590,6 +605,7 @@ async def aggregate( geometry_geohash_grid_precision: int, geometry_geotile_grid_precision: int, datetime_frequency_interval: str, + datetime_search, ignore_unavailable: Optional[bool] = True, ): """Return aggregations of STAC Items.""" @@ -625,7 +641,10 @@ def _fill_aggregation_parameters(name: str, agg: dict) -> dict: if k in aggregations } - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) + search_task = asyncio.create_task( self.client.search( index=index_param, @@ -667,14 +686,21 @@ async def async_prep_create_item( """ await self.check_collection_exists(collection_id=item["collection"]) + alias = index_alias_by_collection_id(item["collection"]) + doc_id = mk_item_id(item["id"], item["collection"]) - if not exist_ok and await self.client.exists( - index=index_alias_by_collection_id(item["collection"]), - id=mk_item_id(item["id"], item["collection"]), - ): - raise ConflictError( - f"Item {item['id']} in collection {item['collection']} already exists" - ) + if not exist_ok: + alias_exists = await self.client.indices.exists_alias(name=alias) + + if alias_exists: + alias_info = await self.client.indices.get_alias(name=alias) + indices = list(alias_info.keys()) + + for index in indices: + if await self.client.exists(index=index, id=doc_id): + raise ConflictError( + f"Item {item['id']} in collection {item['collection']} already exists" + ) return self.item_serializer.stac_to_db(item, base_url) @@ -805,7 +831,6 @@ async def create_item( # Extract item and collection IDs item_id = item["id"] collection_id = item["collection"] - # Ensure kwargs is a dictionary kwargs = kwargs or {} @@ -823,9 +848,12 @@ async def create_item( item=item, base_url=base_url, exist_ok=exist_ok ) + target_index = await self.async_index_inserter.get_target_index( + collection_id, item + ) # Index the item in the database await self.client.index( - index=index_alias_by_collection_id(collection_id), + index=target_index, id=mk_item_id(item_id, collection_id), document=item, refresh=refresh, @@ -904,13 +932,28 @@ async def json_patch_item( script = operations_to_script(script_operations) try: - await self.client.update( + search_response = await self.client.search( index=index_alias_by_collection_id(collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, + ) + if search_response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + document_index = search_response["hits"]["hits"][0]["_index"] + await self.client.update( + index=document_index, id=mk_item_id(item_id, collection_id), script=script, refresh=True, ) - + except ESNotFoundError: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) except BadRequestError as exc: raise HTTPException( status_code=400, detail=exc.info["error"]["caused_by"] @@ -921,7 +964,9 @@ async def json_patch_item( if new_collection_id: await self.client.reindex( body={ - "dest": {"index": f"{ITEMS_INDEX_PREFIX}{new_collection_id}"}, + 
"dest": { + "index": f"{ITEMS_INDEX_PREFIX}{new_collection_id}" + }, # # noqa "source": { "index": f"{ITEMS_INDEX_PREFIX}{collection_id}", "query": {"term": {"id": {"value": item_id}}}, @@ -929,8 +974,8 @@ async def json_patch_item( "script": { "lang": "painless", "source": ( - f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');""" - f"""ctx._source.collection = '{new_collection_id}';""" + f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');""" # noqa + f"""ctx._source.collection = '{new_collection_id}';""" # noqa ), }, }, @@ -990,9 +1035,9 @@ async def delete_item(self, item_id: str, collection_id: str, **kwargs: Any): try: # Perform the delete operation - await self.client.delete( + await self.client.delete_by_query( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={"query": {"term": {"_id": mk_item_id(item_id, collection_id)}}}, refresh=refresh, ) except ESNotFoundError: @@ -1092,8 +1137,10 @@ async def create_collection(self, collection: Collection, **kwargs: Any): refresh=refresh, ) - # Create the item index for the collection - await create_item_index(collection_id) + if self.async_index_inserter.should_create_collection_index(): + await self.async_index_inserter.create_simple_index( + self.client, collection_id + ) async def find_collection(self, collection_id: str) -> Collection: """Find and return a collection from the database. @@ -1367,9 +1414,12 @@ async def bulk_async( # Perform the bulk insert raise_on_error = self.async_settings.raise_on_bulk_error + actions = await self.async_index_inserter.prepare_bulk_actions( + collection_id, processed_items + ) success, errors = await helpers.async_bulk( self.client, - mk_actions(collection_id, processed_items), + actions, refresh=refresh, raise_on_error=raise_on_error, ) diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py index 08e9a42a..ec8fb90b 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py @@ -1,4 +1,5 @@ """API configuration.""" + import logging import os import ssl diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index c323b307..4ff44ca0 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -4,7 +4,7 @@ import logging from base64 import urlsafe_b64decode, urlsafe_b64encode from copy import deepcopy -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type import attr import orjson @@ -26,7 +26,7 @@ AsyncOpensearchSettings as AsyncSearchSettings, ) from stac_fastapi.opensearch.config import OpensearchSettings as SyncSearchSettings -from stac_fastapi.sfeos_helpers import filter +from stac_fastapi.sfeos_helpers import filter as filter_module from stac_fastapi.sfeos_helpers.database import ( apply_free_text_filter_shared, apply_intersects_filter_shared, @@ -34,8 +34,6 @@ delete_item_index_shared, get_queryables_mapping_shared, index_alias_by_collection_id, - index_by_collection_id, - indices, mk_actions, mk_item_id, populate_sort_shared, @@ -55,15 +53,18 @@ COLLECTIONS_INDEX, DEFAULT_SORT, ES_COLLECTIONS_MAPPINGS, - ES_ITEMS_MAPPINGS, - ES_ITEMS_SETTINGS, ITEM_INDICES, ITEMS_INDEX_PREFIX, Geometry, ) 
+from stac_fastapi.sfeos_helpers.search_engine import ( + BaseIndexInserter, + BaseIndexSelector, + IndexInsertionFactory, + IndexSelectorFactory, +) from stac_fastapi.types.errors import ConflictError, NotFoundError from stac_fastapi.types.links import resolve_links -from stac_fastapi.types.rfc3339 import DateTimeType from stac_fastapi.types.stac import Collection, Item logger = logging.getLogger(__name__) @@ -104,33 +105,6 @@ async def create_collection_index() -> None: await client.close() -async def create_item_index(collection_id: str) -> None: - """ - Create the index for Items. The settings of the index template will be used implicitly. - - Args: - collection_id (str): Collection identifier. - - Returns: - None - - """ - client = AsyncSearchSettings().create_client - - index_name = f"{index_by_collection_id(collection_id)}-000001" - exists = await client.indices.exists(index=index_name) - if not exists: - await client.indices.create( - index=index_name, - body={ - "aliases": {index_alias_by_collection_id(collection_id): {}}, - "mappings": ES_ITEMS_MAPPINGS, - "settings": ES_ITEMS_SETTINGS, - }, - ) - await client.close() - - async def delete_item_index(collection_id: str) -> None: """Delete the index for items in a collection. @@ -152,6 +126,9 @@ class DatabaseLogic(BaseDatabaseLogic): async_settings: AsyncSearchSettings = attr.ib(factory=AsyncSearchSettings) sync_settings: SyncSearchSettings = attr.ib(factory=SyncSearchSettings) + async_index_selector: BaseIndexSelector = attr.ib(init=False) + async_index_inserter: BaseIndexInserter = attr.ib(init=False) + client = attr.ib(init=False) sync_client = attr.ib(init=False) @@ -159,6 +136,10 @@ def __attrs_post_init__(self): """Initialize clients after the class is instantiated.""" self.client = self.async_settings.create_client self.sync_client = self.sync_settings.create_client + self.async_index_inserter = IndexInsertionFactory.create_insertion_strategy( + self.client + ) + self.async_index_selector = IndexSelectorFactory.create_selector(self.client) item_serializer: Type[ItemSerializer] = attr.ib(default=ItemSerializer) collection_serializer: Type[CollectionSerializer] = attr.ib( @@ -234,15 +215,23 @@ async def get_one_item(self, collection_id: str, item_id: str) -> Dict: with the index for the Collection as the target index and the combined `mk_item_id` as the document id. """ try: - item = await self.client.get( + response = await self.client.search( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, ) + if response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + + return response["hits"]["hits"][0]["_source"] except exceptions.NotFoundError: raise NotFoundError( f"Item {item_id} does not exist inside Collection {collection_id}" ) - return item["_source"] async def get_queryables_mapping(self, collection_id: str = "*") -> dict: """Retrieve mapping of Queryables for search. @@ -296,31 +285,21 @@ def apply_free_text_filter(search: Search, free_text_queries: Optional[List[str] @staticmethod def apply_datetime_filter( - search: Search, interval: Optional[Union[DateTimeType, str]] - ) -> Search: + search: Search, datetime: Optional[str] + ) -> Tuple[Search, Dict[str, Optional[str]]]: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. 
-            interval: Optional datetime interval to filter by. Can be:
-                - A single datetime string (e.g., "2023-01-01T12:00:00")
-                - A datetime range string (e.g., "2023-01-01/2023-12-31")
-                - A datetime object
-                - A tuple of (start_datetime, end_datetime)
+            datetime: Optional RFC 3339 datetime or interval string (e.g., "2023-01-01/2023-12-31").

         Returns:
-            The filtered search object.
+            A tuple of the filtered search and the parsed datetime range used for index selection.
         """
-        if not interval:
-            return search
+        datetime_search = return_date(datetime)

-        should = []
-        try:
-            datetime_search = return_date(interval)
-        except (ValueError, TypeError) as e:
-            # Handle invalid interval formats if return_date fails
-            logger.error(f"Invalid interval format: {interval}, error: {e}")
-            return search
+        if not datetime_search:
+            return search, datetime_search

         if "eq" in datetime_search:
             # For exact matches, include:
@@ -387,7 +366,10 @@ def apply_datetime_filter(
             ),
         ]

-        return search.query(Q("bool", should=should, minimum_should_match=1))
+        return (
+            search.query(Q("bool", should=should, minimum_should_match=1)),
+            datetime_search,
+        )

     @staticmethod
     def apply_bbox_filter(search: Search, bbox: List):
@@ -484,7 +466,7 @@ async def apply_cql2_filter(
             otherwise the original Search object.
         """
         if _filter is not None:
-            es_query = filter.to_es(await self.get_queryables_mapping(), _filter)
+            es_query = filter_module.to_es(await self.get_queryables_mapping(), _filter)
             search = search.filter(es_query)

         return search
@@ -511,6 +493,7 @@ async def execute_search(
         token: Optional[str],
         sort: Optional[Dict[str, Dict[str, str]]],
         collection_ids: Optional[List[str]],
+        datetime_search: Dict[str, Optional[str]],
         ignore_unavailable: bool = True,
     ) -> Tuple[Iterable[Dict[str, Any]], Optional[int], Optional[str]]:
         """Execute a search query with limit and other optional parameters.

         Args:
             search (Search): The search object with query parameters.
             limit (int): The maximum number of results to be returned.
             token (Optional[str]): The token used to return the next set of results.
             sort (Optional[Dict[str, Dict[str, str]]]): Specifies how the results should be sorted.
             collection_ids (Optional[List[str]]): The collection ids to search.
+            datetime_search (Dict[str, Optional[str]]): Datetime range used for index selection.
             ignore_unavailable (bool, optional): Whether to ignore unavailable collections. Defaults to True.
Returns: @@ -537,7 +521,9 @@ async def execute_search( search_body: Dict[str, Any] = {} query = search.query.to_dict() if search.query else None - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) if len(index_param) > ES_MAX_URL_LENGTH - 300: index_param = ITEM_INDICES query = add_collections_to_body(collection_ids, query) @@ -614,6 +600,7 @@ async def aggregate( geometry_geohash_grid_precision: int, geometry_geotile_grid_precision: int, datetime_frequency_interval: str, + datetime_search, ignore_unavailable: Optional[bool] = True, ): """Return aggregations of STAC Items.""" @@ -647,7 +634,10 @@ def _fill_aggregation_parameters(name: str, agg: dict) -> dict: if k in aggregations } - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) + search_task = asyncio.create_task( self.client.search( index=index_param, @@ -840,8 +830,13 @@ async def create_item( item = await self.async_prep_create_item( item=item, base_url=base_url, exist_ok=exist_ok ) + + target_index = await self.async_index_inserter.get_target_index( + collection_id, item + ) + await self.client.index( - index=index_alias_by_collection_id(collection_id), + index=target_index, id=mk_item_id(item_id, collection_id), body=item, refresh=refresh, @@ -920,13 +915,28 @@ async def json_patch_item( script = operations_to_script(script_operations) try: - await self.client.update( + search_response = await self.client.search( index=index_alias_by_collection_id(collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, + ) + if search_response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + document_index = search_response["hits"]["hits"][0]["_index"] + await self.client.update( + index=document_index, id=mk_item_id(item_id, collection_id), body={"script": script}, refresh=True, ) - + except exceptions.NotFoundError: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) except exceptions.RequestError as exc: raise HTTPException( status_code=400, detail=exc.info["error"]["caused_by"] @@ -945,8 +955,8 @@ async def json_patch_item( "script": { "lang": "painless", "source": ( - f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');""" - f"""ctx._source.collection = '{new_collection_id}';""" + f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');""" # noqa: E702 + f"""ctx._source.collection = '{new_collection_id}';""" # noqa: E702 ), }, }, @@ -1000,9 +1010,9 @@ async def delete_item(self, item_id: str, collection_id: str, **kwargs: Any): ) try: - await self.client.delete( + await self.client.delete_by_query( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={"query": {"term": {"_id": mk_item_id(item_id, collection_id)}}}, refresh=refresh, ) except exceptions.NotFoundError: @@ -1093,8 +1103,10 @@ async def create_collection(self, collection: Collection, **kwargs: Any): body=collection, refresh=refresh, ) - - await create_item_index(collection_id) + if self.async_index_inserter.should_create_collection_index(): + await self.async_index_inserter.create_simple_index( + self.client, collection_id + ) async def find_collection(self, collection_id: str) -> Collection: """Find and return a collection from the database. 
@@ -1303,6 +1315,7 @@ async def delete_collection(self, collection_id: str, **kwargs: Any):
         await self.client.delete(
             index=COLLECTIONS_INDEX, id=collection_id, refresh=refresh
         )
+        # Delete the item index for the collection
         await delete_item_index(collection_id)

     async def bulk_async(
@@ -1356,9 +1369,13 @@
             return 0, []

         raise_on_error = self.async_settings.raise_on_bulk_error
+        actions = await self.async_index_inserter.prepare_bulk_actions(
+            collection_id, processed_items
+        )
+
         success, errors = await helpers.async_bulk(
             self.client,
-            mk_actions(collection_id, processed_items),
+            actions,
             refresh=refresh,
             raise_on_error=raise_on_error,
         )
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py
index 1f335245..1f77cd9e 100644
--- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py
@@ -313,9 +313,11 @@ async def aggregate(
         )

         if aggregate_request.datetime:
-            search = self.database.apply_datetime_filter(
-                search=search, interval=aggregate_request.datetime
+            search, datetime_search = self.database.apply_datetime_filter(
+                search=search, datetime=aggregate_request.datetime
             )
+        else:
+            datetime_search = {"gte": None, "lte": None}

         if aggregate_request.bbox:
             bbox = aggregate_request.bbox
@@ -414,6 +416,7 @@
                 geometry_geohash_grid_precision,
                 geometry_geotile_grid_precision,
                 datetime_frequency_interval,
+                datetime_search,
             )
         except Exception as error:
             if not isinstance(error, IndexError):
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py
index 31bf28d8..bacf1ac3 100644
--- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py
@@ -30,11 +30,12 @@
 """

 # Re-export all functions for backward compatibility
-from .datetime import return_date
+from .datetime import extract_date, extract_first_date_from_index, return_date
 from .document import mk_actions, mk_item_id
 from .index import (
     create_index_templates_shared,
     delete_item_index_shared,
+    filter_indexes_by_datetime,
     index_alias_by_collection_id,
     index_by_collection_id,
     indices,
@@ -53,6 +54,7 @@
     "delete_item_index_shared",
     "index_alias_by_collection_id",
     "index_by_collection_id",
+    "filter_indexes_by_datetime",
     "indices",
     # Query operations
     "apply_free_text_filter_shared",
@@ -68,4 +70,6 @@
     "get_bool_env",
     # Datetime utilities
     "return_date",
+    "extract_date",
+    "extract_first_date_from_index",
 ]
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py
index 352ed4b5..d6b68e85 100644
--- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py
@@ -4,14 +4,19 @@ Elasticsearch and OpenSearch
query formatting. """ +import logging +import re +from datetime import date from datetime import datetime as datetime_type from typing import Dict, Optional, Union from stac_fastapi.types.rfc3339 import DateTimeType +logger = logging.getLogger(__name__) + def return_date( - interval: Optional[Union[DateTimeType, str]] + interval: Optional[Union[DateTimeType, str]], ) -> Dict[str, Optional[str]]: """ Convert a date interval to an Elasticsearch/OpenSearch query format. @@ -39,8 +44,14 @@ def return_date( if isinstance(interval, str): if "/" in interval: parts = interval.split("/") - result["gte"] = parts[0] if parts[0] != ".." else None - result["lte"] = parts[1] if len(parts) > 1 and parts[1] != ".." else None + result["gte"] = ( + parts[0] if parts[0] != ".." else datetime_type.min.isoformat() + "Z" + ) + result["lte"] = ( + parts[1] + if len(parts) > 1 and parts[1] != ".." + else datetime_type.max.isoformat() + "Z" + ) else: converted_time = interval if interval != ".." else None result["gte"] = result["lte"] = converted_time @@ -58,3 +69,53 @@ def return_date( result["lte"] = end.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" return result + + +def extract_date(date_str: str) -> date: + """Extract date from ISO format string. + + Args: + date_str: ISO format date string + + Returns: + A date object extracted from the input string. + """ + date_str = date_str.replace("Z", "+00:00") + return datetime_type.fromisoformat(date_str).date() + + +def extract_first_date_from_index(index_name: str) -> date: + """Extract the first date from an index name containing date patterns. + + Searches for date patterns (YYYY-MM-DD) within the index name string + and returns the first found date as a date object. + + Args: + index_name: Index name containing date patterns. + + Returns: + A date object extracted from the first date pattern found in the index name. + + """ + date_pattern = r"\d{4}-\d{2}-\d{2}" + match = re.search(date_pattern, index_name) + + if not match: + logger.error(f"No date pattern found in index name: '{index_name}'") + raise ValueError( + f"No date pattern (YYYY-MM-DD) found in index name: '{index_name}'" + ) + + date_string = match.group(0) + + try: + extracted_date = datetime_type.strptime(date_string, "%Y-%m-%d").date() + return extracted_date + except ValueError as e: + logger.error( + f"Invalid date format found in index name '{index_name}': " + f"'{date_string}' - {str(e)}" + ) + raise ValueError( + f"Invalid date format in index name '{index_name}': '{date_string}'" + ) from e diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py index 3305f50f..c36a36fa 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py @@ -3,9 +3,13 @@ This module provides functions for creating and managing indices in Elasticsearch/OpenSearch. """ +import re +from datetime import datetime from functools import lru_cache from typing import Any, List, Optional +from dateutil.parser import parse # type: ignore[import] + from stac_fastapi.sfeos_helpers.mappings import ( _ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE, COLLECTIONS_INDEX, @@ -66,6 +70,59 @@ def indices(collection_ids: Optional[List[str]]) -> str: ) +def filter_indexes_by_datetime( + indexes: List[str], gte: Optional[str], lte: Optional[str] +) -> List[str]: + """Filter indexes based on datetime range extracted from index names. 
+ + Args: + indexes: List of index names containing dates + gte: Greater than or equal date filter (ISO format, optional 'Z' suffix) + lte: Less than or equal date filter (ISO format, optional 'Z' suffix) + + Returns: + List of filtered index names + """ + + def parse_datetime(dt_str: str) -> datetime: + """Parse datetime string, handling both with and without 'Z' suffix.""" + return parse(dt_str).replace(tzinfo=None) + + def extract_date_range_from_index(index_name: str) -> tuple: + """Extract start and end dates from index name.""" + date_pattern = r"(\d{4}-\d{2}-\d{2})" + dates = re.findall(date_pattern, index_name) + + if len(dates) == 1: + start_date = datetime.strptime(dates[0], "%Y-%m-%d") + max_date = datetime.max.replace(microsecond=0) + return start_date, max_date + else: + start_date = datetime.strptime(dates[0], "%Y-%m-%d") + end_date = datetime.strptime(dates[1], "%Y-%m-%d") + return start_date, end_date + + def is_index_in_range( + start_date: datetime, end_date: datetime, gte_dt: datetime, lte_dt: datetime + ) -> bool: + """Check if index date range overlaps with filter range.""" + return not ( + end_date.date() < gte_dt.date() or start_date.date() > lte_dt.date() + ) + + gte_dt = parse_datetime(gte) if gte else datetime.min.replace(microsecond=0) + lte_dt = parse_datetime(lte) if lte else datetime.max.replace(microsecond=0) + + filtered_indexes = [] + + for index in indexes: + start_date, end_date = extract_date_range_from_index(index) + if is_index_in_range(start_date, end_date, gte_dt, lte_dt): + filtered_indexes.append(index) + + return filtered_indexes + + async def create_index_templates_shared(settings: Any) -> None: """Create index templates for Elasticsearch/OpenSearch Collection and Item indices. @@ -120,11 +177,11 @@ async def delete_item_index_shared(settings: Any, collection_id: str) -> None: client = settings.create_client name = index_alias_by_collection_id(collection_id) - resolved = await client.indices.resolve_index(name=name) + resolved = await client.indices.resolve_index(name=name, ignore=[404]) if "aliases" in resolved and resolved["aliases"]: [alias] = resolved["aliases"] await client.indices.delete_alias(index=alias["indices"], name=alias["name"]) await client.indices.delete(index=alias["indices"]) else: - await client.indices.delete(index=name) + await client.indices.delete(index=name, ignore=[404]) await client.close() diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py new file mode 100644 index 00000000..84b3bc32 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py @@ -0,0 +1,27 @@ +"""Search engine index management package.""" + +from .base import BaseIndexInserter +from .factory import IndexInsertionFactory +from .index_operations import IndexOperations +from .inserters import DatetimeIndexInserter, SimpleIndexInserter +from .managers import DatetimeIndexManager, IndexSizeManager +from .selection import ( + BaseIndexSelector, + DatetimeBasedIndexSelector, + IndexSelectorFactory, + UnfilteredIndexSelector, +) + +__all__ = [ + "BaseIndexInserter", + "BaseIndexSelector", + "IndexOperations", + "IndexSizeManager", + "DatetimeIndexManager", + "DatetimeIndexInserter", + "SimpleIndexInserter", + "IndexInsertionFactory", + "DatetimeBasedIndexSelector", + "UnfilteredIndexSelector", + "IndexSelectorFactory", +] diff --git 
a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py new file mode 100644 index 00000000..46f9c6f5 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py @@ -0,0 +1,51 @@ +"""Base classes for index inserters.""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List + + +class BaseIndexInserter(ABC): + """Base async index inserter with common async methods.""" + + @abstractmethod + async def get_target_index( + self, collection_id: str, product: Dict[str, Any] + ) -> str: + """Get target index for a product asynchronously. + + Args: + collection_id (str): Collection identifier. + product (Dict[str, Any]): Product data. + + Returns: + str: Target index name. + """ + pass + + @abstractmethod + async def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Prepare bulk actions for multiple items asynchronously. + + Args: + collection_id (str): Collection identifier. + items (List[Dict[str, Any]]): List of items to process. + + Returns: + List[Dict[str, Any]]: List of bulk actions. + """ + pass + + @abstractmethod + async def create_simple_index(self, client: Any, collection_id: str) -> str: + """Create a simple index asynchronously. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + + Returns: + str: Created index name. + """ + pass diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py new file mode 100644 index 00000000..a69df558 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py @@ -0,0 +1,36 @@ +"""Factory for creating index insertion strategies.""" + +from typing import Any + +from stac_fastapi.core.utilities import get_bool_env + +from .base import BaseIndexInserter +from .index_operations import IndexOperations +from .inserters import DatetimeIndexInserter, SimpleIndexInserter + + +class IndexInsertionFactory: + """Factory for creating index insertion strategies.""" + + @staticmethod + def create_insertion_strategy( + client: Any, + ) -> BaseIndexInserter: + """Create async insertion strategy based on configuration. + + Args: + client: Async search engine client instance. + + Returns: + BaseIndexInserter: Configured async insertion strategy. 
+ """ + index_operations = IndexOperations() + + use_datetime_partitioning = get_bool_env( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + if use_datetime_partitioning: + return DatetimeIndexInserter(client, index_operations) + else: + return SimpleIndexInserter(index_operations, client) diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py new file mode 100644 index 00000000..42028a7a --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py @@ -0,0 +1,167 @@ +"""Search engine adapters for different implementations.""" + +import uuid +from typing import Any, Dict + +from stac_fastapi.sfeos_helpers.database import ( + index_alias_by_collection_id, + index_by_collection_id, +) +from stac_fastapi.sfeos_helpers.mappings import ( + _ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE, + ES_ITEMS_MAPPINGS, + ES_ITEMS_SETTINGS, + ITEMS_INDEX_PREFIX, +) + + +class IndexOperations: + """Base class for search engine adapters with common implementations.""" + + async def create_simple_index(self, client: Any, collection_id: str) -> str: + """Create a simple index for the given collection. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + + Returns: + str: Created index name. + """ + index_name = f"{index_by_collection_id(collection_id)}-000001" + alias_name = index_alias_by_collection_id(collection_id) + + await client.indices.create( + index=index_name, + body=self._create_index_body({alias_name: {}}), + params={"ignore": [400]}, + ) + return index_name + + async def create_datetime_index( + self, client: Any, collection_id: str, start_date: str + ) -> str: + """Create a datetime-based index for the given collection. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + start_date (str): Start date for the alias. + + Returns: + str: Created index alias name. + """ + index_name = self.create_index_name(collection_id) + alias_name = self.create_alias_name(collection_id, start_date) + collection_alias = index_alias_by_collection_id(collection_id) + await client.indices.create( + index=index_name, + body=self._create_index_body({collection_alias: {}, alias_name: {}}), + ) + return alias_name + + @staticmethod + async def update_index_alias(client: Any, end_date: str, old_alias: str) -> str: + """Update index alias with new end date. + + Args: + client: Search engine client instance. + end_date (str): End date for the alias. + old_alias (str): Current alias name. + + Returns: + str: New alias name. + """ + new_alias = f"{old_alias}-{end_date}" + aliases_info = await client.indices.get_alias(name=old_alias) + actions = [] + + for index_name in aliases_info.keys(): + actions.append({"remove": {"index": index_name, "alias": old_alias}}) + actions.append({"add": {"index": index_name, "alias": new_alias}}) + + await client.indices.update_aliases(body={"actions": actions}) + return new_alias + + @staticmethod + async def change_alias_name(client: Any, old_alias: str, new_alias: str) -> None: + """Change alias name from old to new. + + Args: + client: Search engine client instance. + old_alias (str): Current alias name. + new_alias (str): New alias name. 
+
+        Returns:
+            None
+        """
+        aliases_info = await client.indices.get_alias(name=old_alias)
+        actions = []
+
+        for index_name in aliases_info.keys():
+            actions.append({"remove": {"index": index_name, "alias": old_alias}})
+            actions.append({"add": {"index": index_name, "alias": new_alias}})
+        await client.indices.update_aliases(body={"actions": actions})
+
+    @staticmethod
+    def create_index_name(collection_id: str) -> str:
+        """Create index name from collection ID and uuid4.
+
+        Args:
+            collection_id (str): Collection identifier.
+
+        Returns:
+            str: Formatted index name.
+        """
+        cleaned = collection_id.translate(_ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE)
+        return f"{ITEMS_INDEX_PREFIX}{cleaned.lower()}_{uuid.uuid4()}"
+
+    @staticmethod
+    def create_alias_name(collection_id: str, start_date: str) -> str:
+        """Create alias name from collection ID and start date.
+
+        Args:
+            collection_id (str): Collection identifier.
+            start_date (str): Start date for the alias.
+
+        Returns:
+            str: Alias name with initial date.
+        """
+        cleaned = collection_id.translate(_ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE)
+        return f"{ITEMS_INDEX_PREFIX}{cleaned.lower()}_{start_date}"
+
+    @staticmethod
+    def _create_index_body(aliases: Dict[str, Dict]) -> Dict[str, Any]:
+        """Create index body with common settings.
+
+        Args:
+            aliases (Dict[str, Dict]): Aliases configuration.
+
+        Returns:
+            Dict[str, Any]: Index body configuration.
+        """
+        return {
+            "aliases": aliases,
+            "mappings": ES_ITEMS_MAPPINGS,
+            "settings": ES_ITEMS_SETTINGS,
+        }
+
+    @staticmethod
+    async def find_latest_item_in_index(client: Any, index_name: str) -> dict[str, Any]:
+        """Find the most recently dated item in the specified index.
+
+        Args:
+            client: Search engine client instance.
+            index_name (str): Name of the index to query.
+
+        Returns:
+            dict[str, Any]: The search hit for the latest item in the index.
+        """
+        query = {
+            "size": 1,
+            "sort": [{"properties.datetime": {"order": "desc"}}],
+            "_source": ["properties.datetime"],
+        }
+
+        response = await client.search(index=index_name, body=query)
+        return response["hits"]["hits"][0]
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py
new file mode 100644
index 00000000..06e9c729
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py
@@ -0,0 +1,309 @@
+"""Async index insertion strategies."""
+import logging
+from datetime import timedelta
+from typing import Any, Dict, List
+
+from fastapi import HTTPException, status
+
+from stac_fastapi.sfeos_helpers.database import (
+    extract_date,
+    extract_first_date_from_index,
+    index_alias_by_collection_id,
+    mk_item_id,
+)
+
+from .base import BaseIndexInserter
+from .index_operations import IndexOperations
+from .managers import DatetimeIndexManager
+from .selection import DatetimeBasedIndexSelector
+
+logger = logging.getLogger(__name__)
+
+
+class DatetimeIndexInserter(BaseIndexInserter):
+    """Async datetime-based index insertion strategy."""
+
+    def __init__(self, client: Any, index_operations: IndexOperations):
+        """Initialize the async datetime index inserter.
+
+        Args:
+            client: Async search engine client instance.
+            index_operations (IndexOperations): Search engine adapter instance.
+        """
+        self.client = client
+        self.index_operations = index_operations
+        self.datetime_manager = DatetimeIndexManager(client, index_operations)
+
+    @staticmethod
+    def should_create_collection_index() -> bool:
+        """Whether this strategy requires collection index creation.
+
+        Returns:
+            bool: False, as datetime strategy doesn't create collection indexes.
+        """
+        return False
+
+    async def create_simple_index(self, client: Any, collection_id: str) -> str:
+        """Create a simple index asynchronously.
+
+        Args:
+            client: Search engine client instance.
+            collection_id (str): Collection identifier.
+
+        Returns:
+            str: Created index name.
+        """
+        return await self.index_operations.create_simple_index(client, collection_id)
+
+    async def get_target_index(
+        self, collection_id: str, product: Dict[str, Any]
+    ) -> str:
+        """Get target index for a single product.
+
+        Args:
+            collection_id (str): Collection identifier.
+            product (Dict[str, Any]): Product data containing datetime information.
+
+        Returns:
+            str: Target index name for the product.
+        """
+        index_selector = DatetimeBasedIndexSelector(self.client)
+        return await self._get_target_index_internal(
+            index_selector, collection_id, product, check_size=True
+        )
+
+    async def prepare_bulk_actions(
+        self, collection_id: str, items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """Prepare bulk actions for multiple items.
+
+        Args:
+            collection_id (str): Collection identifier.
+            items (List[Dict[str, Any]]): List of items to process.
+
+        Returns:
+            List[Dict[str, Any]]: List of bulk actions ready for execution.
+        """
+        if not items:
+            msg = "The product list cannot be empty."
+            logger.error(msg)
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=msg)
+
+        items.sort(key=lambda item: item["properties"]["datetime"])
+        index_selector = DatetimeBasedIndexSelector(self.client)
+
+        await self._ensure_indexes_exist(index_selector, collection_id, items)
+        await self._check_and_handle_oversized_index(
+            index_selector, collection_id, items
+        )
+
+        actions = []
+        for item in items:
+            target_index = await self._get_target_index_internal(
+                index_selector, collection_id, item, check_size=False
+            )
+            actions.append(
+                {
+                    "_index": target_index,
+                    "_id": mk_item_id(item["id"], item["collection"]),
+                    "_source": item,
+                }
+            )
+
+        return actions
+
+    async def _get_target_index_internal(
+        self,
+        index_selector,
+        collection_id: str,
+        product: Dict[str, Any],
+        check_size: bool = True,
+    ) -> str:
+        """Get target index with size checking internally.
+
+        Args:
+            index_selector: Index selector instance.
+            collection_id (str): Collection identifier.
+            product (Dict[str, Any]): Product data.
+            check_size (bool): Whether to check index size limits.
+
+        Returns:
+            str: Target index name.
+ """ + product_datetime = self.datetime_manager.validate_product_datetime(product) + datetime_range = {"gte": product_datetime, "lte": product_datetime} + target_index = await index_selector.select_indexes( + [collection_id], datetime_range + ) + all_indexes = await index_selector.get_collection_indexes(collection_id) + + if not all_indexes: + target_index = await self.datetime_manager.handle_new_collection( + collection_id, product_datetime + ) + await index_selector.refresh_cache() + return target_index + + all_indexes.sort() + start_date = extract_date(product_datetime) + end_date = extract_first_date_from_index(all_indexes[0]) + + if start_date < end_date: + alias = await self.datetime_manager.handle_early_date( + collection_id, start_date, end_date + ) + await index_selector.refresh_cache() + + return alias + + if target_index != all_indexes[-1]: + return target_index + + if check_size and await self.datetime_manager.size_manager.is_index_oversized( + target_index + ): + target_index = await self.datetime_manager.handle_oversized_index( + collection_id, target_index, product_datetime + ) + await index_selector.refresh_cache() + + return target_index + + async def _ensure_indexes_exist( + self, index_selector, collection_id: str, items: List[Dict[str, Any]] + ): + """Ensure necessary indexes exist for the items. + + Args: + index_selector: Index selector instance. + collection_id (str): Collection identifier. + items (List[Dict[str, Any]]): List of items to process. + """ + all_indexes = await index_selector.get_collection_indexes(collection_id) + + if not all_indexes: + first_item = items[0] + await self.index_operations.create_datetime_index( + self.client, + collection_id, + extract_date(first_item["properties"]["datetime"]), + ) + await index_selector.refresh_cache() + + async def _check_and_handle_oversized_index( + self, index_selector, collection_id: str, items: List[Dict[str, Any]] + ) -> None: + """Check if index is oversized and create new index if needed. + + Checks if the index where the first item would be inserted is oversized. + If so, creates a new index starting from the next day. + + Args: + index_selector: Index selector instance. + collection_id (str): Collection identifier. + items (List[Dict[str, Any]]): List of items to process. + + Returns: + None + """ + first_item = items[0] + first_item_index = await self._get_target_index_internal( + index_selector, collection_id, first_item, check_size=False + ) + + all_indexes = await index_selector.get_collection_indexes(collection_id) + all_indexes.sort() + latest_index = all_indexes[-1] + + if first_item_index != latest_index: + return None + + if not await self.datetime_manager.size_manager.is_index_oversized( + first_item_index + ): + return None + + latest_item = await self.index_operations.find_latest_item_in_index( + self.client, latest_index + ) + product_datetime = latest_item["_source"]["properties"]["datetime"] + end_date = extract_date(product_datetime) + await self.index_operations.update_index_alias( + self.client, str(end_date), latest_index + ) + next_day_start = end_date + timedelta(days=1) + await self.index_operations.create_datetime_index( + self.client, collection_id, str(next_day_start) + ) + await index_selector.refresh_cache() + + +class SimpleIndexInserter(BaseIndexInserter): + """Simple async index insertion strategy.""" + + def __init__(self, index_operations: IndexOperations, client: Any): + """Initialize the async simple index inserter. 
+ + Args: + index_operations (IndexOperations): Search engine adapter instance. + client: Async search engine client instance. + """ + self.search_adapter = index_operations + self.client = client + + @staticmethod + def should_create_collection_index() -> bool: + """Whether this strategy requires collection index creation. + + Returns: + bool: True, as simple strategy creates collection indexes. + """ + return True + + async def create_simple_index(self, client: Any, collection_id: str) -> str: + """Create a simple index asynchronously. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + + Returns: + str: Created index name. + """ + return await self.search_adapter.create_simple_index(client, collection_id) + + async def get_target_index( + self, collection_id: str, product: Dict[str, Any] + ) -> str: + """Get target index (always the collection alias). + + Args: + collection_id (str): Collection identifier. + product (Dict[str, Any]): Product data (not used in simple strategy). + + Returns: + str: Collection alias name. + """ + return index_alias_by_collection_id(collection_id) + + async def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Prepare bulk actions for simple indexing. + + Args: + collection_id (str): Collection identifier. + items (List[Dict[str, Any]]): List of items to process. + + Returns: + List[Dict[str, Any]]: List of bulk actions with collection alias as target. + """ + target_index = index_alias_by_collection_id(collection_id) + return [ + { + "_index": target_index, + "_id": mk_item_id(item["id"], item["collection"]), + "_source": item, + } + for item in items + ] diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py new file mode 100644 index 00000000..1194e634 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py @@ -0,0 +1,198 @@ +"""Index management utilities.""" + +import logging +import os +from datetime import datetime, timedelta +from typing import Any, Dict + +from fastapi import HTTPException, status + +from stac_fastapi.sfeos_helpers.database import ( + extract_date, + extract_first_date_from_index, +) + +from .index_operations import IndexOperations + +logger = logging.getLogger(__name__) + + +class IndexSizeManager: + """Manages index size limits and operations.""" + + def __init__(self, client: Any): + """Initialize the index size manager. + + Args: + client: Search engine client instance. + """ + self.client = client + self.max_size_gb = self._get_max_size_from_env() + + async def get_index_size_in_gb(self, index_name: str) -> float: + """Get index size in gigabytes asynchronously. + + Args: + index_name (str): Name of the index to check. + + Returns: + float: Size of the index in gigabytes. + """ + data = await self.client.indices.stats(index=index_name) + return data["_all"]["primaries"]["store"]["size_in_bytes"] / 1e9 + + async def is_index_oversized(self, index_name: str) -> bool: + """Check if index exceeds size limit asynchronously. + + Args: + index_name (str): Name of the index to check. + + Returns: + bool: True if index exceeds size limit, False otherwise. 
+        """
+        size_gb = await self.get_index_size_in_gb(index_name)
+        is_oversized = size_gb > self.max_size_gb
+        gb_milestone = int(size_gb)
+        if gb_milestone > 0:
+            logger.info(f"Index '{index_name}' size: {gb_milestone}GB")
+
+        if is_oversized:
+            logger.warning(
+                f"Index '{index_name}' is oversized: {size_gb:.2f} GB "
+                f"(limit: {self.max_size_gb} GB)"
+            )
+
+        return is_oversized
+
+    @staticmethod
+    def _get_max_size_from_env() -> float:
+        """Get max size from environment variable with error handling.
+
+        Invalid or non-positive values are logged with a warning and fall
+        back to the default of 25.0 GB; no exception is propagated.
+
+        Returns:
+            float: Maximum index size in GB.
+        """
+        env_value = os.getenv("DATETIME_INDEX_MAX_SIZE_GB", "25")
+
+        try:
+            max_size = float(env_value)
+            if max_size <= 0:
+                raise ValueError(
+                    f"DATETIME_INDEX_MAX_SIZE_GB must be positive, got: {max_size}"
+                )
+            return max_size
+        except (ValueError, TypeError):
+            error_msg = (
+                f"Invalid value for DATETIME_INDEX_MAX_SIZE_GB environment variable: "
+                f"'{env_value}'. Must be a positive number. Using default value 25.0 GB."
+            )
+            logger.warning(error_msg)
+
+            return 25.0
+
+
+class DatetimeIndexManager:
+    """Manages datetime-based index operations."""
+
+    def __init__(self, client: Any, index_operations: IndexOperations):
+        """Initialize the datetime index manager.
+
+        Args:
+            client: Search engine client instance.
+            index_operations (IndexOperations): Search engine adapter instance.
+        """
+        self.client = client
+        self.index_operations = index_operations
+        self.size_manager = IndexSizeManager(client)
+
+    @staticmethod
+    def validate_product_datetime(product: Dict[str, Any]) -> str:
+        """Validate and extract datetime from product.
+
+        Args:
+            product (Dict[str, Any]): Product data containing datetime information.
+
+        Returns:
+            str: Validated product datetime.
+
+        Raises:
+            HTTPException: If product datetime is missing or invalid.
+        """
+        product_datetime = product["properties"]["datetime"]
+        if not product_datetime:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Product datetime is required for indexing",
+            )
+        return product_datetime
+
+    async def handle_new_collection(
+        self, collection_id: str, product_datetime: str
+    ) -> str:
+        """Handle index creation for new collection asynchronously.
+
+        Args:
+            collection_id (str): Collection identifier.
+            product_datetime (str): Product datetime for index naming.
+
+        Returns:
+            str: Created index name.
+        """
+        target_index = await self.index_operations.create_datetime_index(
+            self.client, collection_id, extract_date(product_datetime)
+        )
+        logger.info(
+            f"Successfully created index '{target_index}' for collection '{collection_id}'"
+        )
+        return target_index
+
+    async def handle_early_date(
+        self, collection_id: str, start_date: datetime, end_date: datetime
+    ) -> str:
+        """Handle product with date earlier than existing indexes asynchronously.
+
+        Args:
+            collection_id (str): Collection identifier.
+            start_date (datetime): Start date for the new index.
+            end_date (datetime): End date for alias update.
+
+        Returns:
+            str: Updated alias name.
+ """ + old_alias = self.index_operations.create_alias_name( + collection_id, str(end_date) + ) + new_alias = self.index_operations.create_alias_name( + collection_id, str(start_date) + ) + await self.index_operations.change_alias_name(self.client, old_alias, new_alias) + return new_alias + + async def handle_oversized_index( + self, collection_id: str, target_index: str, product_datetime: str + ) -> str: + """Handle index that exceeds size limit asynchronously. + + Args: + collection_id (str): Collection identifier. + target_index (str): Current target index name. + product_datetime (str): Product datetime for new index. + + Returns: + str: New or updated index name. + """ + end_date = extract_date(product_datetime) + latest_index_start = extract_first_date_from_index(target_index) + + if end_date != latest_index_start: + await self.index_operations.update_index_alias( + self.client, str(end_date), target_index + ) + target_index = await self.index_operations.create_datetime_index( + self.client, collection_id, str(end_date + timedelta(days=1)) + ) + + return target_index diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py new file mode 100644 index 00000000..cf68159c --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py @@ -0,0 +1,15 @@ +"""Index selection strategies package.""" + +from .base import BaseIndexSelector +from .cache_manager import IndexAliasLoader, IndexCacheManager +from .factory import IndexSelectorFactory +from .selectors import DatetimeBasedIndexSelector, UnfilteredIndexSelector + +__all__ = [ + "IndexCacheManager", + "IndexAliasLoader", + "DatetimeBasedIndexSelector", + "UnfilteredIndexSelector", + "IndexSelectorFactory", + "BaseIndexSelector", +] diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py new file mode 100644 index 00000000..95f40672 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py @@ -0,0 +1,30 @@ +"""Base classes for index selection strategies.""" + +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + + +class BaseIndexSelector(ABC): + """Base class for async index selectors.""" + + @abstractmethod + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select appropriate indexes asynchronously. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + datetime_search (Dict[str, Optional[str]]): Datetime search criteria. + + Returns: + str: Comma-separated string of selected index names. 
+        """
+        pass
+
+    @abstractmethod
+    async def refresh_cache(self):
+        """Refresh any cached index data."""
+        pass
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py
new file mode 100644
index 00000000..3b65244d
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py
@@ -0,0 +1,127 @@
+"""Cache management for index selection strategies."""
+
+import threading
+import time
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+
+from stac_fastapi.sfeos_helpers.database import index_alias_by_collection_id
+from stac_fastapi.sfeos_helpers.mappings import ITEMS_INDEX_PREFIX
+
+
+class IndexCacheManager:
+    """Manages caching of index aliases with expiration."""
+
+    def __init__(self, cache_ttl_seconds: int = 3600):
+        """Initialize the cache manager.
+
+        Args:
+            cache_ttl_seconds (int): Time-to-live for cache entries in seconds.
+        """
+        self._cache: Optional[Dict[str, List[str]]] = None
+        self._timestamp: float = 0
+        self._ttl = cache_ttl_seconds
+        self._lock = threading.Lock()
+
+    @property
+    def is_expired(self) -> bool:
+        """Check if the cache has expired.
+
+        Returns:
+            bool: True if cache is expired, False otherwise.
+        """
+        return time.time() - self._timestamp > self._ttl
+
+    def get_cache(self) -> Optional[Dict[str, List[str]]]:
+        """Get the current cache if not expired.
+
+        Returns:
+            Optional[Dict[str, List[str]]]: Cache data if valid, None if expired.
+        """
+        with self._lock:
+            if self.is_expired:
+                return None
+            return {k: v.copy() for k, v in self._cache.items()}
+
+    def set_cache(self, data: Dict[str, List[str]]) -> None:
+        """Set cache data and update timestamp.
+
+        Args:
+            data (Dict[str, List[str]]): Cache data to store.
+        """
+        with self._lock:
+            self._cache = data
+            self._timestamp = time.time()
+
+    def clear_cache(self) -> None:
+        """Clear the cache and reset timestamp."""
+        with self._lock:
+            self._cache = None
+            self._timestamp = 0
+
+
+class IndexAliasLoader:
+    """Asynchronous loader for index aliases."""
+
+    def __init__(self, client: Any, cache_manager: IndexCacheManager):
+        """Initialize the async alias loader.
+
+        Args:
+            client: Async search engine client instance.
+            cache_manager (IndexCacheManager): Cache manager instance.
+        """
+        self.client = client
+        self.cache_manager = cache_manager
+
+    async def load_aliases(self) -> Dict[str, List[str]]:
+        """Load index aliases from search engine.
+
+        Returns:
+            Dict[str, List[str]]: Mapping of base aliases to item aliases.
+        """
+        response = await self.client.indices.get_alias(index=f"{ITEMS_INDEX_PREFIX}*")
+        result = defaultdict(list)
+        for index_info in response.values():
+            aliases = index_info.get("aliases", {})
+            items_aliases = sorted(
+                [
+                    alias
+                    for alias in aliases.keys()
+                    if alias.startswith(ITEMS_INDEX_PREFIX)
+                ]
+            )
+
+            if items_aliases:
+                result[items_aliases[0]].extend(items_aliases[1:])
+
+        self.cache_manager.set_cache(result)
+        return result
+
+    async def get_aliases(self) -> Dict[str, List[str]]:
+        """Get aliases from cache or load if expired.
+
+        Returns:
+            Dict[str, List[str]]: Alias mapping data.
+        """
+        cached = self.cache_manager.get_cache()
+        if cached is not None:
+            return cached
+        return await self.load_aliases()
+
+    async def refresh_aliases(self) -> Dict[str, List[str]]:
+        """Force refresh aliases from search engine.
+
+        Returns:
+            Dict[str, List[str]]: Fresh alias mapping data.
+        """
+        return await self.load_aliases()
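[Reviewer note] A small sketch of the cache contract above (illustrative only; it assumes the `IndexCacheManager` defined in this file): readers get a defensive copy while the entry is fresh, and `None` on expiry, which is what sends `get_aliases` back to `load_aliases`.

```python
# Illustrative only; assumes IndexCacheManager from this module.
import time

cache = IndexCacheManager(cache_ttl_seconds=1)
cache.set_cache({"items_test-collection": ["items_test-collection_2020-02-12"]})

assert cache.get_cache() is not None  # fresh entry: defensive copy returned
time.sleep(1.1)
assert cache.get_cache() is None      # expired: caller reloads via IndexAliasLoader
```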
+
+    async def get_collection_indexes(self, collection_id: str) -> List[str]:
+        """Get all index aliases for a specific collection.
+
+        Args:
+            collection_id (str): Collection identifier.
+
+        Returns:
+            List[str]: List of index aliases for the collection.
+        """
+        aliases = await self.get_aliases()
+        return aliases.get(index_alias_by_collection_id(collection_id), [])
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py
new file mode 100644
index 00000000..4ada945b
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py
@@ -0,0 +1,37 @@
+"""Factory for creating index selection strategies."""
+
+from typing import Any
+
+from stac_fastapi.core.utilities import get_bool_env
+
+from .base import BaseIndexSelector
+from .selectors import DatetimeBasedIndexSelector, UnfilteredIndexSelector
+
+
+class IndexSelectorFactory:
+    """Factory class for creating index selector instances."""
+
+    @staticmethod
+    def create_selector(client: Any) -> BaseIndexSelector:
+        """Create an appropriate asynchronous index selector based on environment configuration.
+
+        Checks the ENABLE_DATETIME_INDEX_FILTERING environment variable to determine
+        whether to use datetime-based filtering or return all available indices.
+
+        Args:
+            client: Asynchronous Elasticsearch/OpenSearch client instance, used only if datetime
+                filtering is enabled.
+
+        Returns:
+            BaseIndexSelector: Either a DatetimeBasedIndexSelector if datetime
+                filtering is enabled, or an UnfilteredIndexSelector otherwise.
+        """
+        use_datetime_filtering = get_bool_env(
+            "ENABLE_DATETIME_INDEX_FILTERING", default="false"
+        )
+
+        return (
+            DatetimeBasedIndexSelector(client)
+            if use_datetime_filtering
+            else UnfilteredIndexSelector()
+        )
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py
new file mode 100644
index 00000000..20f919ab
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py
@@ -0,0 +1,129 @@
+"""Async index selectors with datetime-based filtering."""
+
+from typing import Any, Dict, List, Optional
+
+from stac_fastapi.sfeos_helpers.database import filter_indexes_by_datetime
+from stac_fastapi.sfeos_helpers.mappings import ITEM_INDICES
+
+from ...database import indices
+from .base import BaseIndexSelector
+from .cache_manager import IndexAliasLoader, IndexCacheManager
+
+
+class DatetimeBasedIndexSelector(BaseIndexSelector):
+    """Asynchronous index selector that filters indices based on datetime criteria with caching."""
+
+    _instance = None
+
+    def __new__(cls, client):
+        """Create singleton instance.
+
+        Args:
+            client: Async search engine client instance.
+
+        Returns:
+            DatetimeBasedIndexSelector: Singleton instance.
+        """
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, client: Any):
+        """Initialize the datetime-based index selector.
+
+        Args:
+            client: Elasticsearch/OpenSearch client instance used for querying
+                index aliases and metadata.
+ """ + if not hasattr(self, "_initialized"): + self.cache_manager = IndexCacheManager() + self.alias_loader = IndexAliasLoader(client, self.cache_manager) + self._initialized = True + + async def refresh_cache(self) -> Dict[str, List[str]]: + """Force refresh of the aliases cache. + + Returns: + Dict[str, List[str]]: Refreshed dictionary mapping base collection aliases + to lists of their corresponding item index aliases. + """ + return await self.alias_loader.refresh_aliases() + + async def get_collection_indexes(self, collection_id: str) -> List[str]: + """Get all index aliases for a specific collection. + + Args: + collection_id (str): The ID of the collection to retrieve indexes for. + + Returns: + List[str]: List of index aliases associated with the collection. + Returns empty list if collection is not found in cache. + """ + return await self.alias_loader.get_collection_indexes(collection_id) + + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select indexes filtered by collection IDs and datetime criteria. + + For each specified collection, retrieves its associated indexes and filters + them based on datetime range. If no collection IDs are provided, returns + all item indices. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + If None or empty, returns all item indices. + datetime_search (Dict[str, Optional[str]]): Dictionary containing datetime + search criteria with 'gte' and 'lte' keys for range filtering. + + Returns: + str: Comma-separated string of selected index names that match the + collection and datetime criteria. Returns empty string if no + indexes match the criteria. + """ + if collection_ids: + selected_indexes = [] + for collection_id in collection_ids: + collection_indexes = await self.get_collection_indexes(collection_id) + filtered_indexes = filter_indexes_by_datetime( + collection_indexes, + datetime_search.get("gte"), + datetime_search.get("lte"), + ) + selected_indexes.extend(filtered_indexes) + + return ",".join(selected_indexes) if selected_indexes else "" + + return ITEM_INDICES + + +class UnfilteredIndexSelector(BaseIndexSelector): + """Index selector that returns all available indices without filtering.""" + + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select all indices for given collections without datetime filtering. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + If None, all collections are considered. + datetime_search (Dict[str, Optional[str]]): Datetime search criteria + (ignored by this implementation). + + Returns: + str: Comma-separated string of all available index names for the collections. + """ + return indices(collection_ids) + + async def refresh_cache(self): + """Refresh cache (no-op for unfiltered selector). + + Note: + Unfiltered selector doesn't use cache, so this is a no-op operation. 
+ """ + pass diff --git a/stac_fastapi/tests/api/test_api.py b/stac_fastapi/tests/api/test_api.py index efc97174..a9de4460 100644 --- a/stac_fastapi/tests/api/test_api.py +++ b/stac_fastapi/tests/api/test_api.py @@ -1,7 +1,9 @@ +import os import random import uuid from copy import deepcopy from datetime import datetime, timedelta +from unittest.mock import patch import pytest @@ -25,6 +27,7 @@ "GET /collections/{collection_id}", "GET /collections/{collection_id}/queryables", "GET /collections/{collection_id}/items", + "POST /collections/{collection_id}/bulk_items", "GET /collections/{collection_id}/items/{item_id}", "GET /search", "POST /search", @@ -427,6 +430,9 @@ async def test_search_point_does_not_intersect(app_client, ctx): @pytest.mark.asyncio async def test_datetime_response_format(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -464,6 +470,9 @@ async def test_datetime_response_format(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_non_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -500,6 +509,9 @@ async def test_datetime_non_interval(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -536,6 +548,9 @@ async def test_datetime_interval(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_bad_non_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -572,6 +587,9 @@ async def test_datetime_bad_non_interval(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_bad_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -823,3 +841,632 @@ async def test_big_int_eo_search( results = {x["properties"][attr] for x in resp_json["features"]} assert len(results) == expected assert results == {value} + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_in_past_date_change_alias_name_for_datetime_index( + app_client, ctx, load_test_data, txn_client +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2012-02-12T12:30:22Z" + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + assert response.status_code == 201 + indices = await txn_client.database.client.indices.get_alias( + index="items_test-collection" + ) + expected_aliases = [ + "items_test-collection_2012-02-12", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_uses_existing_datetime_index_for_datetime_index( + app_client, ctx, load_test_data, txn_client +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") 
+ item["id"] = str(uuid.uuid4()) + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias( + index="items_test-collection" + ) + expected_aliases = [ + "items_test-collection_2020-02-12", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_with_different_date_same_index_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2022-02-12T12:30:22Z" + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias( + index="items_test-collection" + ) + expected_aliases = [ + "items_test-collection_2020-02-12", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_new_index_when_size_limit_exceeded_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2024-02-12T12:30:22Z" + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.return_value = 26.0 + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_aliases = [ + "items_test-collection_2020-02-12-2024-02-12", + "items_test-collection_2024-02-13", + ] + all_aliases = set() + + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + item_2 = deepcopy(item) + item_2["id"] = str(uuid.uuid4()) + item_2["properties"]["datetime"] = "2023-02-12T12:30:22Z" + response_2 = await app_client.post( + f"/collections/{item_2['collection']}/items", json=item_2 + ) + assert response_2.status_code == 201 + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_fails_without_datetime_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = None + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + assert response.status_code == 400 + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_bulk_create_items_with_same_date_range_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + base_item = load_test_data("test_item.json") + items_dict 
= {}
+
+    for i in range(10):
+        item = deepcopy(base_item)
+        item["id"] = str(uuid.uuid4())
+        item["properties"]["datetime"] = f"2020-02-{12 + i}T12:30:22Z"
+        items_dict[item["id"]] = item
+
+    payload = {"type": "FeatureCollection", "features": list(items_dict.values())}
+    response = await app_client.post(
+        f"/collections/{base_item['collection']}/items", json=payload
+    )
+
+    assert response.status_code == 201
+
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+    expected_aliases = [
+        "items_test-collection_2020-02-12",
+    ]
+    all_aliases = set()
+    for index_info in indices.values():
+        all_aliases.update(index_info.get("aliases", {}).keys())
+    assert all(alias in all_aliases for alias in expected_aliases)
+
+
+@pytest.mark.datetime_filtering
+@pytest.mark.asyncio
+async def test_bulk_create_items_with_different_date_ranges_for_datetime_index(
+    app_client, load_test_data, txn_client, ctx
+):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
+    base_item = load_test_data("test_item.json")
+    items_dict = {}
+
+    for i in range(3):
+        item = deepcopy(base_item)
+        item["id"] = str(uuid.uuid4())
+        item["properties"]["datetime"] = f"2020-02-{12 + i}T12:30:22Z"
+        items_dict[item["id"]] = item
+
+    for i in range(2):
+        item = deepcopy(base_item)
+        item["id"] = str(uuid.uuid4())
+        item["properties"]["datetime"] = f"2010-02-{10 + i}T12:30:22Z"
+        items_dict[item["id"]] = item
+
+    payload = {"type": "FeatureCollection", "features": list(items_dict.values())}
+
+    response = await app_client.post(
+        f"/collections/{base_item['collection']}/items", json=payload
+    )
+
+    assert response.status_code == 201
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+
+    expected_aliases = ["items_test-collection_2010-02-10"]
+    all_aliases = set()
+    for index_info in indices.values():
+        all_aliases.update(index_info.get("aliases", {}).keys())
+    assert all(alias in all_aliases for alias in expected_aliases)
+
+
+@pytest.mark.datetime_filtering
+@pytest.mark.asyncio
+async def test_bulk_create_items_with_size_limit_exceeded_for_datetime_index(
+    app_client, load_test_data, txn_client, ctx
+):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip("Datetime index filtering not enabled")
+
+    base_item = load_test_data("test_item.json")
+    collection_id = base_item["collection"]
+
+    def create_items(date_prefix: str, start_day: int, count: int) -> dict:
+        items = {}
+        for i in range(count):
+            item = deepcopy(base_item)
+            item["id"] = str(uuid.uuid4())
+            item["properties"][
+                "datetime"
+            ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z"
+            items[item["id"]] = item
+        return items
+
+    with patch(
+        "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb"
+    ) as mock_get_size:
+        mock_get_size.side_effect = [10, 26]
+
+        first_items = create_items("2010-02", start_day=10, count=2)
+        first_payload = {
+            "type": "FeatureCollection",
+            "features": list(first_items.values()),
+        }
+
+        response = await app_client.post(
+            f"/collections/{collection_id}/items", json=first_payload
+        )
+        assert response.status_code == 201
+
+        second_items = create_items("2019-02", start_day=15, count=3)
+        second_payload = {
+            "type": "FeatureCollection",
+            "features": list(second_items.values()),
+        }
+
+        response = await app_client.post(
+            f"/collections/{collection_id}/items", json=second_payload
+        )
+        assert response.status_code == 201
+
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+    expected_aliases = [
+        
"items_test-collection_2010-02-10-2020-02-12", + "items_test-collection_2020-02-13", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_bulk_create_items_with_early_date_in_second_batch_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_aliases = [ + "items_test-collection_2008-01-15-2020-02-12", + "items_test-collection_2020-02-13", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_bulk_create_items_and_retrieve_by_id_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + 
assert response.status_code == 201 + + response = await app_client.get( + f"/collections/{collection_id}/items/{base_item['id']}" + ) + assert response.status_code == 200 + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_patch_collection_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + patch_data = { + "description": "Updated description via PATCH", + } + response = await app_client.patch( + f"/collections/{collection_id}?refresh=true", json=patch_data + ) + assert response.status_code == 200 + assert response.json()["description"] == "Updated description via PATCH" + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_put_collection_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + collection_response = await app_client.get(f"/collections/{collection_id}") + assert collection_response.status_code == 200 + collection_data = collection_response.json() + + collection_data["description"] = 
"Updated description via PUT" + collection_data["title"] = "Updated title via PUT" + response = await app_client.put( + f"/collections/{collection_id}?refresh=true", json=collection_data + ) + assert response.json()["description"] == "Updated description via PUT" + assert response.json()["title"] == "Updated title via PUT" + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_patch_item_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + patch_data = {"properties": {"description": "Updated description via PATCH"}} + + response = await app_client.patch( + f"/collections/{collection_id}/items/{base_item['id']}", json=patch_data + ) + assert response.status_code == 200 + assert ( + response.json()["properties"]["description"] + == "Updated description via PATCH" + ) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_put_item_for_datetime_index(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + item_response = await 
app_client.get( + f"/collections/{collection_id}/items/{base_item['id']}" + ) + assert item_response.status_code == 200 + item_data = item_response.json() + + item_data["properties"]["platform"] = "Updated platform via PUT" + response = await app_client.put( + f"/collections/{collection_id}/items/{base_item['id']}", json=item_data + ) + assert response.json()["properties"]["platform"] == "Updated platform via PUT" diff --git a/stac_fastapi/tests/conftest.py b/stac_fastapi/tests/conftest.py index d8c5fc88..23da2668 100644 --- a/stac_fastapi/tests/conftest.py +++ b/stac_fastapi/tests/conftest.py @@ -26,6 +26,7 @@ from stac_fastapi.core.rate_limit import setup_rate_limit from stac_fastapi.core.utilities import get_bool_env from stac_fastapi.sfeos_helpers.aggregation import EsAsyncBaseAggregationClient +from stac_fastapi.sfeos_helpers.mappings import ITEMS_INDEX_PREFIX if os.getenv("BACKEND", "elasticsearch").lower() == "opensearch": from stac_fastapi.opensearch.app import app_config @@ -158,6 +159,8 @@ async def delete_collections_and_items(txn_client: TransactionsClient) -> None: await refresh_indices(txn_client) await txn_client.database.delete_items() await txn_client.database.delete_collections() + await txn_client.database.client.indices.delete(index=f"{ITEMS_INDEX_PREFIX}*") + await txn_client.database.async_index_selector.refresh_cache() async def refresh_indices(txn_client: TransactionsClient) -> None: diff --git a/stac_fastapi/tests/database/test_database.py b/stac_fastapi/tests/database/test_database.py index 86611235..67897c15 100644 --- a/stac_fastapi/tests/database/test_database.py +++ b/stac_fastapi/tests/database/test_database.py @@ -1,3 +1,4 @@ +import os import uuid import pytest @@ -27,6 +28,9 @@ async def test_index_mapping_collections(ctx): @pytest.mark.asyncio async def test_index_mapping_items(txn_client, load_test_data): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + collection = load_test_data("test_collection.json") collection["id"] = str(uuid.uuid4()) await txn_client.create_collection( diff --git a/stac_fastapi/tests/resources/test_item.py b/stac_fastapi/tests/resources/test_item.py index 0102bf9b..0299cdc0 100644 --- a/stac_fastapi/tests/resources/test_item.py +++ b/stac_fastapi/tests/resources/test_item.py @@ -114,8 +114,15 @@ async def test_create_uppercase_collection_with_item( async def test_update_item_already_exists(app_client, ctx, load_test_data): """Test updating an item which already exists (transactions extension)""" item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) assert item["properties"]["gsd"] != 16 item["properties"]["gsd"] = 16 + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + assert response.status_code == 201 + await app_client.put( f"/collections/{item['collection']}/items/{item['id']}", json=item ) @@ -998,6 +1005,9 @@ async def _search_and_get_ids( async def test_search_datetime_with_null_datetime( app_client, txn_client, load_test_data ): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + """Test datetime filtering when properties.datetime is null or set, ensuring start_datetime and end_datetime are set when datetime is null.""" # Setup: Create test collection test_collection = load_test_data("test_collection.json") diff --git a/tox.ini b/tox.ini index 100ee64c..546c7767 100644 --- a/tox.ini +++ b/tox.ini @@ -13,4 +13,8 @@ max-line-length = 90 profile=black known_first_party = stac_fastapi known_third_party = 
rasterio,stac-pydantic,sqlalchemy,geoalchemy2,fastapi -sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER \ No newline at end of file +sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER + +[tool:pytest] +markers = + datetime_filtering: tests that require ENABLE_DATETIME_INDEX_FILTERING=true \ No newline at end of file
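
[Reviewer note] An illustrative test sketch (not part of the diff) for how the new `datetime_filtering` marker, the `ENABLE_DATETIME_INDEX_FILTERING` flag, and the selector factory fit together. It assumes pytest's `monkeypatch` fixture and the public names exported by the `selection` package.

```python
# Sketch only: how ENABLE_DATETIME_INDEX_FILTERING drives selector choice.
from stac_fastapi.sfeos_helpers.search_engine.selection import (
    DatetimeBasedIndexSelector,
    IndexSelectorFactory,
    UnfilteredIndexSelector,
)


def test_factory_respects_env_flag(monkeypatch):
    monkeypatch.delenv("ENABLE_DATETIME_INDEX_FILTERING", raising=False)
    selector = IndexSelectorFactory.create_selector(client=None)
    assert isinstance(selector, UnfilteredIndexSelector)

    monkeypatch.setenv("ENABLE_DATETIME_INDEX_FILTERING", "true")
    selector = IndexSelectorFactory.create_selector(client=None)
    # Note: DatetimeBasedIndexSelector is a singleton, so the same instance
    # (with its first client) is returned on subsequent calls.
    assert isinstance(selector, DatetimeBasedIndexSelector)
```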