From 590ccb327eecbd1cecd5f396d0d06468c0233a2d Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Thu, 7 Aug 2025 17:28:08 +0200 Subject: [PATCH 01/11] ready for code review --- .github/workflows/cicd.yml | 3 + CHANGELOG.md | 26 + Makefile | 15 +- README.md | 49 +- compose.yml | 3 + stac_fastapi/core/stac_fastapi/core/core.py | 23 +- .../core/stac_fastapi/core/datetime_utils.py | 1 + .../core/stac_fastapi/core/serializers.py | 1 + .../core/stac_fastapi/core/session.py | 1 + .../elasticsearch/database_logic.py | 134 ++-- .../stac_fastapi/opensearch/config.py | 1 + .../stac_fastapi/opensearch/database_logic.py | 150 ++-- .../sfeos_helpers/aggregation/client.py | 5 +- .../sfeos_helpers/database/__init__.py | 6 +- .../sfeos_helpers/database/datetime.py | 67 +- .../sfeos_helpers/database/index.py | 61 +- .../sfeos_helpers/search_engine/__init__.py | 27 + .../sfeos_helpers/search_engine/base.py | 51 ++ .../sfeos_helpers/search_engine/factory.py | 36 + .../search_engine/index_operations.py | 167 +++++ .../sfeos_helpers/search_engine/inserters.py | 309 +++++++++ .../sfeos_helpers/search_engine/managers.py | 198 ++++++ .../search_engine/selection/__init__.py | 15 + .../search_engine/selection/base.py | 30 + .../search_engine/selection/cache_manager.py | 127 ++++ .../search_engine/selection/factory.py | 37 + .../search_engine/selection/selectors.py | 129 ++++ stac_fastapi/tests/api/test_api.py | 647 ++++++++++++++++++ stac_fastapi/tests/conftest.py | 3 + stac_fastapi/tests/database/test_database.py | 4 + stac_fastapi/tests/resources/test_item.py | 10 + tox.ini | 6 +- 32 files changed, 2189 insertions(+), 153 deletions(-) create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py create mode 100644 stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index abf6ebfa..ae34d115 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -28,6 +28,7 @@ jobs: xpack.security.enabled: false xpack.security.transport.ssl.enabled: false ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false ports: - 9200:9200 @@ -44,6 +45,7 @@ jobs: xpack.security.enabled: false xpack.security.transport.ssl.enabled: false ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false ports: - 9400:9400 @@ -60,6 +62,7 @@ jobs: plugins.security.disabled: true plugins.security.ssl.http.enabled: true OPENSEARCH_JAVA_OPTS: -Xms512m -Xmx512m + action.destructive_requires_name: false ports: - 9202:9202 diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 6ed76fcc..8f820245 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,32 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +- Added comprehensive index management system with dynamic selection and insertion strategies for improved performance and scalability [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405) +- Added `ENABLE_DATETIME_INDEX_FILTERING` environment variable to enable datetime-based index selection using collection IDs. Requires indexes in format: `STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day`, e.g. `items_sentinel-2-l2a_2025-06-06-2025-09-22`. Default is `false`. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405) +- Added `DATETIME_INDEX_MAX_SIZE_GB` environment variable to set maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Default is `25` GB. Only applies when `ENABLE_DATETIME_INDEX_FILTERING` is enabled. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405) +- Added search engine adapter system with support for both Elasticsearch and OpenSearch [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - `SearchEngineAdapter` base class with engine-specific implementations + - `ElasticsearchAdapter` and `OpenSearchAdapter` with tailored index creation methods + - Automatic engine type detection based on client class + - `SearchEngineAdapterFactory` for creating appropriate adapters +- Added datetime-based index selection strategies with caching support [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - `AsyncDatetimeBasedIndexSelector` and `SyncDatetimeBasedIndexSelector` for temporal filtering + - `IndexCacheManager` with configurable TTL-based cache expiration (default 1 hour) + - `AsyncIndexAliasLoader` and `SyncIndexAliasLoader` for alias management + - `UnfilteredIndexSelector` as fallback for returning all available indexes +- Added index insertion strategies with automatic partitioning [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - Simple insertion strategy (`AsyncSimpleIndexInserter`, `SyncSimpleIndexInserter`) for traditional single-index-per-collection approach + - Datetime-based insertion strategy (`AsyncDatetimeIndexInserter`, `SyncDatetimeIndexInserter`) with time-based partitioning + - Automatic index size monitoring and splitting when limits exceeded + - Handling of chronologically early data and bulk operations +- Added index management utilities [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405): + - `IndexSizeManager` for size monitoring and overflow handling + - `DatetimeIndexManager` for datetime-based index operations + - Factory patterns (`IndexInsertionFactory`, `IndexSelectorFactory`) for strategy creation based on configuration + + ## [v6.1.0] - 2025-07-24 ### Added diff --git a/Makefile b/Makefile index c23ca951..204b31a1 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ run_os = docker compose \ .PHONY: image-deploy-es image-deploy-es: docker build -f dockerfiles/Dockerfile.dev.es -t stac-fastapi-elasticsearch:latest . - + .PHONY: image-deploy-os image-deploy-os: docker build -f dockerfiles/Dockerfile.dev.os -t stac-fastapi-opensearch:latest . 
@@ -71,14 +71,19 @@ test-opensearch: -$(run_os) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest' docker compose down -.PHONY: test -test: - -$(run_es) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh elasticsearch:9200 && cd stac_fastapi/tests/ && pytest --cov=stac_fastapi --cov-report=term-missing' +.PHONY: test-datetime-filtering-es +test-datetime-filtering-es: + -$(run_es) /bin/bash -c 'export ENABLE_DATETIME_INDEX_FILTERING=true && ./scripts/wait-for-it-es.sh elasticsearch:9200 && cd stac_fastapi/tests/ && pytest -s --cov=stac_fastapi --cov-report=term-missing -m datetime_filtering' docker compose down - -$(run_os) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest --cov=stac_fastapi --cov-report=term-missing' +.PHONY: test-datetime-filtering-os +test-datetime-filtering-os: + -$(run_os) /bin/bash -c 'export ENABLE_DATETIME_INDEX_FILTERING=true && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest -s --cov=stac_fastapi --cov-report=term-missing -m datetime_filtering' docker compose down +.PHONY: test +test: test-elasticsearch test-datetime-filtering-es test-opensearch test-datetime-filtering-os + .PHONY: run-database-es run-database-es: docker compose run --rm elasticsearch diff --git a/README.md b/README.md index 9e5a4674..9ba41ed8 100644 --- a/README.md +++ b/README.md @@ -201,31 +201,32 @@ There are two main ways to run the API locally: You can customize additional settings in your `.env` file: -| Variable | Description | Default | Required | -|------------------------------|--------------------------------------------------------------------------------------|--------------------------|---------------------------------------------------------------------------------------------| -| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | -| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS)| Optional | -| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `true` | Optional | -| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `true` | Optional | -| `ES_API_KEY` | API Key for external Elasticsearch/OpenSearch. | N/A | Optional | +| Variable | Description | Default | Required | +|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|---------------------------------------------------------------------------------------------| +| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | +| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS) | Optional | +| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `false` | Optional | +| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `false` | Optional | | `ES_TIMEOUT` | Client timeout for Elasticsearch/OpenSearch. | DB client default | Optional | -| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | -| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | -| `STAC_FASTAPI_VERSION` | API version. 
| `2.1` | Optional |
-| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional |
-| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional |
-| `APP_PORT` | Server port. | `8000` | Optional |
-| `ENVIRONMENT` | Runtime environment. | `local` | Optional |
-| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional |
-| `RELOAD` | Enable auto-reload for development. | `true` | Optional |
-| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional |
-| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional |
-| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional |
-| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional |
-| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional |
-| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` | Optional |
-| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional |
-| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional |
+| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional |
+| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional |
+| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional |
+| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional |
+| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional |
+| `APP_PORT` | Server port. | `8080` | Optional |
+| `ENVIRONMENT` | Runtime environment. | `local` | Optional |
+| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional |
+| `RELOAD` | Enable auto-reload for development. | `true` | Optional |
+| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional |
+| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional |
+| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional |
+| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional |
+| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional |
+| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` | Optional |
+| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional |
+| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional |
+| `ENABLE_DATETIME_INDEX_FILTERING` | Enable datetime-based index selection using collection IDs. Requires indexes in format: `STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day`, e.g. `items_sentinel-2-l2a_2025-06-06-2025-09-22`. | `false` | Optional |
+| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Note: This value should account for ~25% overhead due to OS/ES caching of data structures and metadata. Only applies when `ENABLE_DATETIME_INDEX_FILTERING` is enabled. | `25` | Optional |

> [!NOTE]
> The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, `ES_VERIFY_CERTS` and `ES_TIMEOUT` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch.
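As a quick, illustrative sketch (the values shown are examples, not additional defaults), opting into time-partitioned item indexes from a `.env` file could look like this:

```bash
# Illustrative .env fragment: enable datetime-based index selection
ENABLE_DATETIME_INDEX_FILTERING=true
# Split a collection's current index once it grows past ~25 GB
DATETIME_INDEX_MAX_SIZE_GB=25
```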
diff --git a/compose.yml b/compose.yml index 05665595..ba898bb1 100644 --- a/compose.yml +++ b/compose.yml @@ -21,6 +21,7 @@ services: - ES_USE_SSL=false - ES_VERIFY_CERTS=false - BACKEND=elasticsearch + - DATABASE_REFRESH=true ports: - "8080:8080" volumes: @@ -72,6 +73,7 @@ services: hostname: elasticsearch environment: ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false volumes: - ./elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml - ./elasticsearch/snapshots:/usr/share/elasticsearch/snapshots @@ -86,6 +88,7 @@ services: - discovery.type=single-node - plugins.security.disabled=true - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m + - action.destructive_requires_name=false volumes: - ./opensearch/config/opensearch.yml:/usr/share/opensearch/config/opensearch.yml - ./opensearch/snapshots:/usr/share/opensearch/snapshots diff --git a/stac_fastapi/core/stac_fastapi/core/core.py b/stac_fastapi/core/stac_fastapi/core/core.py index 8d1f472b..1fde5bd3 100644 --- a/stac_fastapi/core/stac_fastapi/core/core.py +++ b/stac_fastapi/core/stac_fastapi/core/core.py @@ -37,6 +37,7 @@ BulkTransactionMethod, Items, ) +from stac_fastapi.sfeos_helpers.database import return_date from stac_fastapi.types import stac as stac_types from stac_fastapi.types.conformance import BASE_CONFORMANCE_CLASSES from stac_fastapi.types.core import AsyncBaseCoreClient @@ -324,10 +325,16 @@ async def item_collection( search=search, collection_ids=[collection_id] ) - if datetime: + try: + datetime_search = return_date(datetime) search = self.database.apply_datetime_filter( - search=search, interval=datetime + search=search, datetime_search=datetime_search ) + except (ValueError, TypeError) as e: + # Handle invalid interval formats if return_date fails + msg = f"Invalid interval format: {datetime}, error: {e}" + logger.error(msg) + raise HTTPException(status_code=400, detail=msg) if bbox: bbox = [float(x) for x in bbox] @@ -342,6 +349,7 @@ async def item_collection( sort=None, token=token, collection_ids=[collection_id], + datetime_search=datetime_search, ) items = [ @@ -500,10 +508,16 @@ async def post_search( search=search, collection_ids=search_request.collections ) - if search_request.datetime: + try: + datetime_search = return_date(search_request.datetime) search = self.database.apply_datetime_filter( - search=search, interval=search_request.datetime + search=search, datetime_search=datetime_search ) + except (ValueError, TypeError) as e: + # Handle invalid interval formats if return_date fails + msg = f"Invalid interval format: {search_request.datetime}, error: {e}" + logger.error(msg) + raise HTTPException(status_code=400, detail=msg) if search_request.bbox: bbox = search_request.bbox @@ -560,6 +574,7 @@ async def post_search( token=search_request.token, sort=sort, collection_ids=search_request.collections, + datetime_search=datetime_search, ) fields = ( diff --git a/stac_fastapi/core/stac_fastapi/core/datetime_utils.py b/stac_fastapi/core/stac_fastapi/core/datetime_utils.py index f9dbacf5..87911ac5 100644 --- a/stac_fastapi/core/stac_fastapi/core/datetime_utils.py +++ b/stac_fastapi/core/stac_fastapi/core/datetime_utils.py @@ -1,4 +1,5 @@ """Utility functions to handle datetime parsing.""" + from datetime import datetime, timezone from stac_fastapi.types.rfc3339 import rfc3339_str_to_datetime diff --git a/stac_fastapi/core/stac_fastapi/core/serializers.py b/stac_fastapi/core/stac_fastapi/core/serializers.py index 9b0d36d4..d537b493 100644 --- 
a/stac_fastapi/core/stac_fastapi/core/serializers.py +++ b/stac_fastapi/core/stac_fastapi/core/serializers.py @@ -1,4 +1,5 @@ """Serializers.""" + import abc from copy import deepcopy from typing import Any, List, Optional diff --git a/stac_fastapi/core/stac_fastapi/core/session.py b/stac_fastapi/core/stac_fastapi/core/session.py index d5a7aa3c..990f9d73 100644 --- a/stac_fastapi/core/stac_fastapi/core/session.py +++ b/stac_fastapi/core/stac_fastapi/core/session.py @@ -1,4 +1,5 @@ """database session management.""" + import logging import attr diff --git a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py index 16a8a83d..46766386 100644 --- a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py +++ b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py @@ -4,7 +4,7 @@ import logging from base64 import urlsafe_b64decode, urlsafe_b64encode from copy import deepcopy -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type import attr import elasticsearch.helpers as helpers @@ -27,7 +27,7 @@ PartialItem, PatchOperation, ) -from stac_fastapi.sfeos_helpers import filter +from stac_fastapi.sfeos_helpers import filter as filter_module from stac_fastapi.sfeos_helpers.database import ( apply_free_text_filter_shared, apply_intersects_filter_shared, @@ -36,11 +36,9 @@ get_queryables_mapping_shared, index_alias_by_collection_id, index_by_collection_id, - indices, mk_actions, mk_item_id, populate_sort_shared, - return_date, validate_refresh, ) from stac_fastapi.sfeos_helpers.database.query import ( @@ -59,9 +57,14 @@ ITEMS_INDEX_PREFIX, Geometry, ) +from stac_fastapi.sfeos_helpers.search_engine import ( + BaseIndexInserter, + BaseIndexSelector, + IndexInsertionFactory, + IndexSelectorFactory, +) from stac_fastapi.types.errors import ConflictError, NotFoundError from stac_fastapi.types.links import resolve_links -from stac_fastapi.types.rfc3339 import DateTimeType from stac_fastapi.types.stac import Collection, Item logger = logging.getLogger(__name__) @@ -139,6 +142,8 @@ class DatabaseLogic(BaseDatabaseLogic): sync_settings: SyncElasticsearchSettings = attr.ib( factory=SyncElasticsearchSettings ) + async_index_selector: BaseIndexSelector = attr.ib(init=False) + async_index_inserter: BaseIndexInserter = attr.ib(init=False) client = attr.ib(init=False) sync_client = attr.ib(init=False) @@ -147,6 +152,10 @@ def __attrs_post_init__(self): """Initialize clients after the class is instantiated.""" self.client = self.async_settings.create_client self.sync_client = self.sync_settings.create_client + self.async_index_inserter = IndexInsertionFactory.create_insertion_strategy( + self.client + ) + self.async_index_selector = IndexSelectorFactory.create_selector(self.client) item_serializer: Type[ItemSerializer] = attr.ib(default=ItemSerializer) collection_serializer: Type[CollectionSerializer] = attr.ib( @@ -216,15 +225,23 @@ async def get_one_item(self, collection_id: str, item_id: str) -> Dict: with the index for the Collection as the target index and the combined `mk_item_id` as the document id. 
""" try: - item = await self.client.get( + response = await self.client.search( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, ) + if response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + + return response["hits"]["hits"][0]["_source"] except ESNotFoundError: raise NotFoundError( f"Item {item_id} does not exist inside Collection {collection_id}" ) - return item["_source"] async def get_queryables_mapping(self, collection_id: str = "*") -> dict: """Retrieve mapping of Queryables for search. @@ -260,30 +277,18 @@ def apply_collections_filter(search: Search, collection_ids: List[str]): @staticmethod def apply_datetime_filter( - search: Search, interval: Optional[Union[DateTimeType, str]] + search: Search, datetime_search: Dict[str, Optional[str]] ) -> Search: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - interval: Optional datetime interval to filter by. Can be: - - A single datetime string (e.g., "2023-01-01T12:00:00") - - A datetime range string (e.g., "2023-01-01/2023-12-31") - - A datetime object - - A tuple of (start_datetime, end_datetime) + datetime_search: Dict[str, Optional[str]] Returns: The filtered search object. """ - if not interval: - return search - - should = [] - try: - datetime_search = return_date(interval) - except (ValueError, TypeError) as e: - # Handle invalid interval formats if return_date fails - logger.error(f"Invalid interval format: {interval}, error: {e}") + if not datetime_search: return search if "eq" in datetime_search: @@ -466,7 +471,7 @@ async def apply_cql2_filter( otherwise the original Search object. """ if _filter is not None: - es_query = filter.to_es(await self.get_queryables_mapping(), _filter) + es_query = filter_module.to_es(await self.get_queryables_mapping(), _filter) search = search.query(es_query) return search @@ -493,6 +498,7 @@ async def execute_search( token: Optional[str], sort: Optional[Dict[str, Dict[str, str]]], collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], ignore_unavailable: bool = True, ) -> Tuple[Iterable[Dict[str, Any]], Optional[int], Optional[str]]: """Execute a search query with limit and other optional parameters. @@ -503,6 +509,7 @@ async def execute_search( token (Optional[str]): The token used to return the next set of results. sort (Optional[Dict[str, Dict[str, str]]]): Specifies how the results should be sorted. collection_ids (Optional[List[str]]): The collection ids to search. + datetime_search (Dict[str, Optional[str]]): Datetime range used for index selection. ignore_unavailable (bool, optional): Whether to ignore unavailable collections. Defaults to True. 
Returns: @@ -523,7 +530,9 @@ async def execute_search( query = search.query.to_dict() if search.query else None - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) if len(index_param) > ES_MAX_URL_LENGTH - 300: index_param = ITEM_INDICES query = add_collections_to_body(collection_ids, query) @@ -590,6 +599,7 @@ async def aggregate( geometry_geohash_grid_precision: int, geometry_geotile_grid_precision: int, datetime_frequency_interval: str, + datetime_search, ignore_unavailable: Optional[bool] = True, ): """Return aggregations of STAC Items.""" @@ -625,7 +635,10 @@ def _fill_aggregation_parameters(name: str, agg: dict) -> dict: if k in aggregations } - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) + search_task = asyncio.create_task( self.client.search( index=index_param, @@ -667,14 +680,21 @@ async def async_prep_create_item( """ await self.check_collection_exists(collection_id=item["collection"]) + alias = index_alias_by_collection_id(item["collection"]) + doc_id = mk_item_id(item["id"], item["collection"]) - if not exist_ok and await self.client.exists( - index=index_alias_by_collection_id(item["collection"]), - id=mk_item_id(item["id"], item["collection"]), - ): - raise ConflictError( - f"Item {item['id']} in collection {item['collection']} already exists" - ) + if not exist_ok: + alias_exists = await self.client.indices.exists_alias(name=alias) + + if alias_exists: + alias_info = await self.client.indices.get_alias(name=alias) + indices = list(alias_info.keys()) + + for index in indices: + if await self.client.exists(index=index, id=doc_id): + raise ConflictError( + f"Item {item['id']} in collection {item['collection']} already exists" + ) return self.item_serializer.stac_to_db(item, base_url) @@ -805,7 +825,6 @@ async def create_item( # Extract item and collection IDs item_id = item["id"] collection_id = item["collection"] - # Ensure kwargs is a dictionary kwargs = kwargs or {} @@ -823,9 +842,12 @@ async def create_item( item=item, base_url=base_url, exist_ok=exist_ok ) + target_index = await self.async_index_inserter.get_target_index( + collection_id, item + ) # Index the item in the database await self.client.index( - index=index_alias_by_collection_id(collection_id), + index=target_index, id=mk_item_id(item_id, collection_id), document=item, refresh=refresh, @@ -904,13 +926,28 @@ async def json_patch_item( script = operations_to_script(script_operations) try: - await self.client.update( + search_response = await self.client.search( index=index_alias_by_collection_id(collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, + ) + if search_response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + document_index = search_response["hits"]["hits"][0]["_index"] + await self.client.update( + index=document_index, id=mk_item_id(item_id, collection_id), script=script, refresh=True, ) - + except ESNotFoundError: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) except BadRequestError as exc: raise HTTPException( status_code=400, detail=exc.info["error"]["caused_by"] @@ -921,7 +958,9 @@ async def json_patch_item( if new_collection_id: await self.client.reindex( body={ - "dest": {"index": f"{ITEMS_INDEX_PREFIX}{new_collection_id}"}, + 
"dest": { + "index": f"{ITEMS_INDEX_PREFIX}{new_collection_id}" + }, # # noqa "source": { "index": f"{ITEMS_INDEX_PREFIX}{collection_id}", "query": {"term": {"id": {"value": item_id}}}, @@ -929,8 +968,8 @@ async def json_patch_item( "script": { "lang": "painless", "source": ( - f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');""" - f"""ctx._source.collection = '{new_collection_id}';""" + f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');""" # noqa + f"""ctx._source.collection = '{new_collection_id}';""" # noqa ), }, }, @@ -990,9 +1029,9 @@ async def delete_item(self, item_id: str, collection_id: str, **kwargs: Any): try: # Perform the delete operation - await self.client.delete( + await self.client.delete_by_query( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={"query": {"term": {"_id": mk_item_id(item_id, collection_id)}}}, refresh=refresh, ) except ESNotFoundError: @@ -1092,8 +1131,10 @@ async def create_collection(self, collection: Collection, **kwargs: Any): refresh=refresh, ) - # Create the item index for the collection - await create_item_index(collection_id) + if self.async_index_inserter.should_create_collection_index(): + await self.async_index_inserter.create_simple_index( + self.client, collection_id + ) async def find_collection(self, collection_id: str) -> Collection: """Find and return a collection from the database. @@ -1367,9 +1408,12 @@ async def bulk_async( # Perform the bulk insert raise_on_error = self.async_settings.raise_on_bulk_error + actions = await self.async_index_inserter.prepare_bulk_actions( + collection_id, processed_items + ) success, errors = await helpers.async_bulk( self.client, - mk_actions(collection_id, processed_items), + actions, refresh=refresh, raise_on_error=raise_on_error, ) diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py index 08e9a42a..ec8fb90b 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/config.py @@ -1,4 +1,5 @@ """API configuration.""" + import logging import os import ssl diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index c323b307..83ec6821 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -4,7 +4,7 @@ import logging from base64 import urlsafe_b64decode, urlsafe_b64encode from copy import deepcopy -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type import attr import orjson @@ -26,7 +26,6 @@ AsyncOpensearchSettings as AsyncSearchSettings, ) from stac_fastapi.opensearch.config import OpensearchSettings as SyncSearchSettings -from stac_fastapi.sfeos_helpers import filter from stac_fastapi.sfeos_helpers.database import ( apply_free_text_filter_shared, apply_intersects_filter_shared, @@ -34,12 +33,9 @@ delete_item_index_shared, get_queryables_mapping_shared, index_alias_by_collection_id, - index_by_collection_id, - indices, mk_actions, mk_item_id, populate_sort_shared, - return_date, validate_refresh, ) from stac_fastapi.sfeos_helpers.database.query import ( @@ -50,20 +46,24 @@ merge_to_operations, operations_to_script, ) +from stac_fastapi.sfeos_helpers.filter import filter as filter_module 
from stac_fastapi.sfeos_helpers.mappings import ( AGGREGATION_MAPPING, COLLECTIONS_INDEX, DEFAULT_SORT, ES_COLLECTIONS_MAPPINGS, - ES_ITEMS_MAPPINGS, - ES_ITEMS_SETTINGS, ITEM_INDICES, ITEMS_INDEX_PREFIX, Geometry, ) +from stac_fastapi.sfeos_helpers.search_engine import ( + BaseIndexInserter, + BaseIndexSelector, + IndexInsertionFactory, + IndexSelectorFactory, +) from stac_fastapi.types.errors import ConflictError, NotFoundError from stac_fastapi.types.links import resolve_links -from stac_fastapi.types.rfc3339 import DateTimeType from stac_fastapi.types.stac import Collection, Item logger = logging.getLogger(__name__) @@ -104,33 +104,6 @@ async def create_collection_index() -> None: await client.close() -async def create_item_index(collection_id: str) -> None: - """ - Create the index for Items. The settings of the index template will be used implicitly. - - Args: - collection_id (str): Collection identifier. - - Returns: - None - - """ - client = AsyncSearchSettings().create_client - - index_name = f"{index_by_collection_id(collection_id)}-000001" - exists = await client.indices.exists(index=index_name) - if not exists: - await client.indices.create( - index=index_name, - body={ - "aliases": {index_alias_by_collection_id(collection_id): {}}, - "mappings": ES_ITEMS_MAPPINGS, - "settings": ES_ITEMS_SETTINGS, - }, - ) - await client.close() - - async def delete_item_index(collection_id: str) -> None: """Delete the index for items in a collection. @@ -152,6 +125,9 @@ class DatabaseLogic(BaseDatabaseLogic): async_settings: AsyncSearchSettings = attr.ib(factory=AsyncSearchSettings) sync_settings: SyncSearchSettings = attr.ib(factory=SyncSearchSettings) + async_index_selector: BaseIndexSelector = attr.ib(init=False) + async_index_inserter: BaseIndexInserter = attr.ib(init=False) + client = attr.ib(init=False) sync_client = attr.ib(init=False) @@ -159,6 +135,10 @@ def __attrs_post_init__(self): """Initialize clients after the class is instantiated.""" self.client = self.async_settings.create_client self.sync_client = self.sync_settings.create_client + self.async_index_inserter = IndexInsertionFactory.create_insertion_strategy( + self.client + ) + self.async_index_selector = IndexSelectorFactory.create_selector(self.client) item_serializer: Type[ItemSerializer] = attr.ib(default=ItemSerializer) collection_serializer: Type[CollectionSerializer] = attr.ib( @@ -234,15 +214,23 @@ async def get_one_item(self, collection_id: str, item_id: str) -> Dict: with the index for the Collection as the target index and the combined `mk_item_id` as the document id. """ try: - item = await self.client.get( + response = await self.client.search( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), + body={ + "query": {"term": {"_id": mk_item_id(item_id, collection_id)}}, + "size": 1, + }, ) + if response["hits"]["total"]["value"] == 0: + raise NotFoundError( + f"Item {item_id} does not exist inside Collection {collection_id}" + ) + + return response["hits"]["hits"][0]["_source"] except exceptions.NotFoundError: raise NotFoundError( f"Item {item_id} does not exist inside Collection {collection_id}" ) - return item["_source"] async def get_queryables_mapping(self, collection_id: str = "*") -> dict: """Retrieve mapping of Queryables for search. 
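(Aside: the lookup pattern above is the same on both backends. Because an item may now live in any time-partitioned index behind the collection alias, a direct `GET` by document id against a single index no longer suffices; a term query on `_id` across the alias finds it. A minimal sketch of the idea, with helper names taken from this diff and the id format produced by `mk_item_id` assumed:)

```python
from stac_fastapi.sfeos_helpers.database import (
    index_alias_by_collection_id,
    mk_item_id,
)


async def fetch_item(client, collection_id: str, item_id: str) -> dict:
    """Fetch one item via a term query on _id across the collection alias."""
    response = await client.search(
        index=index_alias_by_collection_id(collection_id),
        body={
            "query": {"term": {"_id": mk_item_id(item_id, collection_id)}},
            "size": 1,
        },
    )
    if response["hits"]["total"]["value"] == 0:
        raise KeyError(f"Item {item_id} not found in {collection_id}")
    return response["hits"]["hits"][0]["_source"]
```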
@@ -296,30 +284,18 @@ def apply_free_text_filter(search: Search, free_text_queries: Optional[List[str] @staticmethod def apply_datetime_filter( - search: Search, interval: Optional[Union[DateTimeType, str]] + search: Search, datetime_search: Dict[str, Optional[str]] ) -> Search: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - interval: Optional datetime interval to filter by. Can be: - - A single datetime string (e.g., "2023-01-01T12:00:00") - - A datetime range string (e.g., "2023-01-01/2023-12-31") - - A datetime object - - A tuple of (start_datetime, end_datetime) + datetime_search: Dict[str, Optional[str]] Returns: The filtered search object. """ - if not interval: - return search - - should = [] - try: - datetime_search = return_date(interval) - except (ValueError, TypeError) as e: - # Handle invalid interval formats if return_date fails - logger.error(f"Invalid interval format: {interval}, error: {e}") + if not datetime_search: return search if "eq" in datetime_search: @@ -484,7 +460,7 @@ async def apply_cql2_filter( otherwise the original Search object. """ if _filter is not None: - es_query = filter.to_es(await self.get_queryables_mapping(), _filter) + es_query = filter_module.to_es(await self.get_queryables_mapping(), _filter) search = search.filter(es_query) return search @@ -511,6 +487,7 @@ async def execute_search( token: Optional[str], sort: Optional[Dict[str, Dict[str, str]]], collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], ignore_unavailable: bool = True, ) -> Tuple[Iterable[Dict[str, Any]], Optional[int], Optional[str]]: """Execute a search query with limit and other optional parameters. @@ -521,6 +498,7 @@ async def execute_search( token (Optional[str]): The token used to return the next set of results. sort (Optional[Dict[str, Dict[str, str]]]): Specifies how the results should be sorted. collection_ids (Optional[List[str]]): The collection ids to search. + datetime_search (Dict[str, Optional[str]]): Datetime range used for index selection. ignore_unavailable (bool, optional): Whether to ignore unavailable collections. Defaults to True. 
Returns:
@@ -537,7 +515,9 @@
        search_body: Dict[str, Any] = {}
        query = search.query.to_dict() if search.query else None
-        index_param = indices(collection_ids)
+        index_param = await self.async_index_selector.select_indexes(
+            collection_ids, datetime_search
+        )
        if len(index_param) > ES_MAX_URL_LENGTH - 300:
            index_param = ITEM_INDICES
            query = add_collections_to_body(collection_ids, query)
@@ -614,6 +594,7 @@ async def aggregate(
        geometry_geohash_grid_precision: int,
        geometry_geotile_grid_precision: int,
        datetime_frequency_interval: str,
+        datetime_search,
        ignore_unavailable: Optional[bool] = True,
    ):
        """Return aggregations of STAC Items."""
@@ -647,7 +628,10 @@ def _fill_aggregation_parameters(name: str, agg: dict) -> dict:
            if k in aggregations
        }
-        index_param = indices(collection_ids)
+        index_param = await self.async_index_selector.select_indexes(
+            collection_ids, datetime_search
+        )
+
        search_task = asyncio.create_task(
            self.client.search(
                index=index_param,
@@ -840,8 +824,13 @@ async def create_item(
        item = await self.async_prep_create_item(
            item=item, base_url=base_url, exist_ok=exist_ok
        )
+
+        target_index = await self.async_index_inserter.get_target_index(
+            collection_id, item
+        )
+
        await self.client.index(
-            index=index_alias_by_collection_id(collection_id),
+            index=target_index,
            id=mk_item_id(item_id, collection_id),
            body=item,
            refresh=refresh,
@@ -920,13 +909,28 @@ async def json_patch_item(
            script = operations_to_script(script_operations)
        try:
-            await self.client.update(
+            search_response = await self.client.search(
                index=index_alias_by_collection_id(collection_id),
+                body={
+                    "query": {"term": {"_id": mk_item_id(item_id, collection_id)}},
+                    "size": 1,
+                },
+            )
+            if search_response["hits"]["total"]["value"] == 0:
+                raise NotFoundError(
+                    f"Item {item_id} does not exist inside Collection {collection_id}"
+                )
+            document_index = search_response["hits"]["hits"][0]["_index"]
+            await self.client.update(
+                index=document_index,
                id=mk_item_id(item_id, collection_id),
                body={"script": script},
                refresh=True,
            )
-
+        except exceptions.NotFoundError:
+            raise NotFoundError(
+                f"Item {item_id} does not exist inside Collection {collection_id}"
+            )
        except exceptions.RequestError as exc:
            raise HTTPException(
                status_code=400, detail=exc.info["error"]["caused_by"]
            )
@@ -945,8 +949,8 @@
                    "script": {
                        "lang": "painless",
                        "source": (
-                            f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');"""
-                            f"""ctx._source.collection = '{new_collection_id}';"""
+                            f"""ctx._id = ctx._id.replace('{collection_id}', '{new_collection_id}');"""  # noqa: E702
+                            f"""ctx._source.collection = '{new_collection_id}';"""  # noqa: E702
                        ),
                    },
                },
@@ -1000,9 +1004,9 @@ async def delete_item(self, item_id: str, collection_id: str, **kwargs: Any):
        )
        try:
-            await self.client.delete(
+            await self.client.delete_by_query(
                index=index_alias_by_collection_id(collection_id),
-                id=mk_item_id(item_id, collection_id),
+                body={"query": {"term": {"_id": mk_item_id(item_id, collection_id)}}},
                refresh=refresh,
            )
        except exceptions.NotFoundError:
@@ -1093,8 +1097,10 @@ async def create_collection(self, collection: Collection, **kwargs: Any):
            body=collection,
            refresh=refresh,
        )
-
-        await create_item_index(collection_id)
+        if self.async_index_inserter.should_create_collection_index():
+            await self.async_index_inserter.create_simple_index(
+                self.client, collection_id
+            )
    async def find_collection(self, collection_id: str) -> Collection:
        """Find and return a collection from
the database.
@@ -1303,6 +1309,7 @@ async def delete_collection(self, collection_id: str, **kwargs: Any):
        await self.client.delete(
            index=COLLECTIONS_INDEX, id=collection_id, refresh=refresh
        )
+        # Delete the item index for the collection
        await delete_item_index(collection_id)
    async def bulk_async(
@@ -1356,9 +1363,13 @@
            return 0, []
        raise_on_error = self.async_settings.raise_on_bulk_error
+        actions = await self.async_index_inserter.prepare_bulk_actions(
+            collection_id, processed_items
+        )
+
        success, errors = await helpers.async_bulk(
            self.client,
-            mk_actions(collection_id, processed_items),
+            actions,
            refresh=refresh,
            raise_on_error=raise_on_error,
        )
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py
index 1f335245..641c81f1 100644
--- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py
@@ -21,6 +21,7 @@
     Aggregation,
     AggregationCollection,
 )
+from stac_fastapi.sfeos_helpers.database import return_date
 from stac_fastapi.types.rfc3339 import DateTimeType
 from .format import frequency_agg, metric_agg
@@ -312,9 +313,10 @@
            search=search, item_ids=aggregate_request.ids
        )
+        datetime_search = return_date(aggregate_request.datetime)
        if aggregate_request.datetime:
            search = self.database.apply_datetime_filter(
-                search=search, interval=aggregate_request.datetime
+                search=search, datetime_search=datetime_search
            )
        if aggregate_request.bbox:
@@ -414,6 +416,7 @@
                geometry_geohash_grid_precision,
                geometry_geotile_grid_precision,
                datetime_frequency_interval,
+                datetime_search,
            )
        except Exception as error:
            if not isinstance(error, IndexError):
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py
index 31bf28d8..bacf1ac3 100644
--- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py
@@ -30,11 +30,12 @@
 """
 # Re-export all functions for backward compatibility
-from .datetime import return_date
+from .datetime import extract_date, extract_first_date_from_index, return_date
 from .document import mk_actions, mk_item_id
 from .index import (
     create_index_templates_shared,
     delete_item_index_shared,
+    filter_indexes_by_datetime,
     index_alias_by_collection_id,
     index_by_collection_id,
     indices,
@@ -53,6 +54,7 @@
     "delete_item_index_shared",
     "index_alias_by_collection_id",
     "index_by_collection_id",
+    "filter_indexes_by_datetime",
     "indices",
     # Query operations
     "apply_free_text_filter_shared",
@@ -68,4 +70,6 @@
     "get_bool_env",
     # Datetime utilities
     "return_date",
+    "extract_date",
+    "extract_first_date_from_index",
 ]
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py
index 352ed4b5..d6b68e85 100644
---
a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py @@ -4,14 +4,19 @@ Elasticsearch and OpenSearch query formatting. """ +import logging +import re +from datetime import date from datetime import datetime as datetime_type from typing import Dict, Optional, Union from stac_fastapi.types.rfc3339 import DateTimeType +logger = logging.getLogger(__name__) + def return_date( - interval: Optional[Union[DateTimeType, str]] + interval: Optional[Union[DateTimeType, str]], ) -> Dict[str, Optional[str]]: """ Convert a date interval to an Elasticsearch/OpenSearch query format. @@ -39,8 +44,14 @@ def return_date( if isinstance(interval, str): if "/" in interval: parts = interval.split("/") - result["gte"] = parts[0] if parts[0] != ".." else None - result["lte"] = parts[1] if len(parts) > 1 and parts[1] != ".." else None + result["gte"] = ( + parts[0] if parts[0] != ".." else datetime_type.min.isoformat() + "Z" + ) + result["lte"] = ( + parts[1] + if len(parts) > 1 and parts[1] != ".." + else datetime_type.max.isoformat() + "Z" + ) else: converted_time = interval if interval != ".." else None result["gte"] = result["lte"] = converted_time @@ -58,3 +69,53 @@ def return_date( result["lte"] = end.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" return result + + +def extract_date(date_str: str) -> date: + """Extract date from ISO format string. + + Args: + date_str: ISO format date string + + Returns: + A date object extracted from the input string. + """ + date_str = date_str.replace("Z", "+00:00") + return datetime_type.fromisoformat(date_str).date() + + +def extract_first_date_from_index(index_name: str) -> date: + """Extract the first date from an index name containing date patterns. + + Searches for date patterns (YYYY-MM-DD) within the index name string + and returns the first found date as a date object. + + Args: + index_name: Index name containing date patterns. + + Returns: + A date object extracted from the first date pattern found in the index name. + + """ + date_pattern = r"\d{4}-\d{2}-\d{2}" + match = re.search(date_pattern, index_name) + + if not match: + logger.error(f"No date pattern found in index name: '{index_name}'") + raise ValueError( + f"No date pattern (YYYY-MM-DD) found in index name: '{index_name}'" + ) + + date_string = match.group(0) + + try: + extracted_date = datetime_type.strptime(date_string, "%Y-%m-%d").date() + return extracted_date + except ValueError as e: + logger.error( + f"Invalid date format found in index name '{index_name}': " + f"'{date_string}' - {str(e)}" + ) + raise ValueError( + f"Invalid date format in index name '{index_name}': '{date_string}'" + ) from e diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py index 3305f50f..c36a36fa 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py @@ -3,9 +3,13 @@ This module provides functions for creating and managing indices in Elasticsearch/OpenSearch. 
""" +import re +from datetime import datetime from functools import lru_cache from typing import Any, List, Optional +from dateutil.parser import parse # type: ignore[import] + from stac_fastapi.sfeos_helpers.mappings import ( _ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE, COLLECTIONS_INDEX, @@ -66,6 +70,59 @@ def indices(collection_ids: Optional[List[str]]) -> str: ) +def filter_indexes_by_datetime( + indexes: List[str], gte: Optional[str], lte: Optional[str] +) -> List[str]: + """Filter indexes based on datetime range extracted from index names. + + Args: + indexes: List of index names containing dates + gte: Greater than or equal date filter (ISO format, optional 'Z' suffix) + lte: Less than or equal date filter (ISO format, optional 'Z' suffix) + + Returns: + List of filtered index names + """ + + def parse_datetime(dt_str: str) -> datetime: + """Parse datetime string, handling both with and without 'Z' suffix.""" + return parse(dt_str).replace(tzinfo=None) + + def extract_date_range_from_index(index_name: str) -> tuple: + """Extract start and end dates from index name.""" + date_pattern = r"(\d{4}-\d{2}-\d{2})" + dates = re.findall(date_pattern, index_name) + + if len(dates) == 1: + start_date = datetime.strptime(dates[0], "%Y-%m-%d") + max_date = datetime.max.replace(microsecond=0) + return start_date, max_date + else: + start_date = datetime.strptime(dates[0], "%Y-%m-%d") + end_date = datetime.strptime(dates[1], "%Y-%m-%d") + return start_date, end_date + + def is_index_in_range( + start_date: datetime, end_date: datetime, gte_dt: datetime, lte_dt: datetime + ) -> bool: + """Check if index date range overlaps with filter range.""" + return not ( + end_date.date() < gte_dt.date() or start_date.date() > lte_dt.date() + ) + + gte_dt = parse_datetime(gte) if gte else datetime.min.replace(microsecond=0) + lte_dt = parse_datetime(lte) if lte else datetime.max.replace(microsecond=0) + + filtered_indexes = [] + + for index in indexes: + start_date, end_date = extract_date_range_from_index(index) + if is_index_in_range(start_date, end_date, gte_dt, lte_dt): + filtered_indexes.append(index) + + return filtered_indexes + + async def create_index_templates_shared(settings: Any) -> None: """Create index templates for Elasticsearch/OpenSearch Collection and Item indices. 
@@ -120,11 +177,11 @@ async def delete_item_index_shared(settings: Any, collection_id: str) -> None: client = settings.create_client name = index_alias_by_collection_id(collection_id) - resolved = await client.indices.resolve_index(name=name) + resolved = await client.indices.resolve_index(name=name, ignore=[404]) if "aliases" in resolved and resolved["aliases"]: [alias] = resolved["aliases"] await client.indices.delete_alias(index=alias["indices"], name=alias["name"]) await client.indices.delete(index=alias["indices"]) else: - await client.indices.delete(index=name) + await client.indices.delete(index=name, ignore=[404]) await client.close() diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py new file mode 100644 index 00000000..84b3bc32 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/__init__.py @@ -0,0 +1,27 @@ +"""Search engine index management package.""" + +from .base import BaseIndexInserter +from .factory import IndexInsertionFactory +from .index_operations import IndexOperations +from .inserters import DatetimeIndexInserter, SimpleIndexInserter +from .managers import DatetimeIndexManager, IndexSizeManager +from .selection import ( + BaseIndexSelector, + DatetimeBasedIndexSelector, + IndexSelectorFactory, + UnfilteredIndexSelector, +) + +__all__ = [ + "BaseIndexInserter", + "BaseIndexSelector", + "IndexOperations", + "IndexSizeManager", + "DatetimeIndexManager", + "DatetimeIndexInserter", + "SimpleIndexInserter", + "IndexInsertionFactory", + "DatetimeBasedIndexSelector", + "UnfilteredIndexSelector", + "IndexSelectorFactory", +] diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py new file mode 100644 index 00000000..46f9c6f5 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/base.py @@ -0,0 +1,51 @@ +"""Base classes for index inserters.""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List + + +class BaseIndexInserter(ABC): + """Base async index inserter with common async methods.""" + + @abstractmethod + async def get_target_index( + self, collection_id: str, product: Dict[str, Any] + ) -> str: + """Get target index for a product asynchronously. + + Args: + collection_id (str): Collection identifier. + product (Dict[str, Any]): Product data. + + Returns: + str: Target index name. + """ + pass + + @abstractmethod + async def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Prepare bulk actions for multiple items asynchronously. + + Args: + collection_id (str): Collection identifier. + items (List[Dict[str, Any]]): List of items to process. + + Returns: + List[Dict[str, Any]]: List of bulk actions. + """ + pass + + @abstractmethod + async def create_simple_index(self, client: Any, collection_id: str) -> str: + """Create a simple index asynchronously. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + + Returns: + str: Created index name. 
+ """ + pass diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py new file mode 100644 index 00000000..a69df558 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/factory.py @@ -0,0 +1,36 @@ +"""Factory for creating index insertion strategies.""" + +from typing import Any + +from stac_fastapi.core.utilities import get_bool_env + +from .base import BaseIndexInserter +from .index_operations import IndexOperations +from .inserters import DatetimeIndexInserter, SimpleIndexInserter + + +class IndexInsertionFactory: + """Factory for creating index insertion strategies.""" + + @staticmethod + def create_insertion_strategy( + client: Any, + ) -> BaseIndexInserter: + """Create async insertion strategy based on configuration. + + Args: + client: Async search engine client instance. + + Returns: + BaseIndexInserter: Configured async insertion strategy. + """ + index_operations = IndexOperations() + + use_datetime_partitioning = get_bool_env( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + if use_datetime_partitioning: + return DatetimeIndexInserter(client, index_operations) + else: + return SimpleIndexInserter(index_operations, client) diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py new file mode 100644 index 00000000..42028a7a --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/index_operations.py @@ -0,0 +1,167 @@ +"""Search engine adapters for different implementations.""" + +import uuid +from typing import Any, Dict + +from stac_fastapi.sfeos_helpers.database import ( + index_alias_by_collection_id, + index_by_collection_id, +) +from stac_fastapi.sfeos_helpers.mappings import ( + _ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE, + ES_ITEMS_MAPPINGS, + ES_ITEMS_SETTINGS, + ITEMS_INDEX_PREFIX, +) + + +class IndexOperations: + """Base class for search engine adapters with common implementations.""" + + async def create_simple_index(self, client: Any, collection_id: str) -> str: + """Create a simple index for the given collection. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + + Returns: + str: Created index name. + """ + index_name = f"{index_by_collection_id(collection_id)}-000001" + alias_name = index_alias_by_collection_id(collection_id) + + await client.indices.create( + index=index_name, + body=self._create_index_body({alias_name: {}}), + params={"ignore": [400]}, + ) + return index_name + + async def create_datetime_index( + self, client: Any, collection_id: str, start_date: str + ) -> str: + """Create a datetime-based index for the given collection. + + Args: + client: Search engine client instance. + collection_id (str): Collection identifier. + start_date (str): Start date for the alias. + + Returns: + str: Created index alias name. + """ + index_name = self.create_index_name(collection_id) + alias_name = self.create_alias_name(collection_id, start_date) + collection_alias = index_alias_by_collection_id(collection_id) + await client.indices.create( + index=index_name, + body=self._create_index_body({collection_alias: {}, alias_name: {}}), + ) + return alias_name + + @staticmethod + async def update_index_alias(client: Any, end_date: str, old_alias: str) -> str: + """Update index alias with new end date. 
+
+        Args:
+            client: Search engine client instance.
+            end_date (str): End date for the alias.
+            old_alias (str): Current alias name.
+
+        Returns:
+            str: New alias name.
+        """
+        new_alias = f"{old_alias}-{end_date}"
+        aliases_info = await client.indices.get_alias(name=old_alias)
+        actions = []
+
+        for index_name in aliases_info.keys():
+            actions.append({"remove": {"index": index_name, "alias": old_alias}})
+            actions.append({"add": {"index": index_name, "alias": new_alias}})
+
+        await client.indices.update_aliases(body={"actions": actions})
+        return new_alias
+
+    @staticmethod
+    async def change_alias_name(client: Any, old_alias: str, new_alias: str) -> None:
+        """Change alias name from old to new.
+
+        Args:
+            client: Search engine client instance.
+            old_alias (str): Current alias name.
+            new_alias (str): New alias name.
+
+        Returns:
+            None
+        """
+        aliases_info = await client.indices.get_alias(name=old_alias)
+        actions = []
+
+        for index_name in aliases_info.keys():
+            actions.append({"remove": {"index": index_name, "alias": old_alias}})
+            actions.append({"add": {"index": index_name, "alias": new_alias}})
+        await client.indices.update_aliases(body={"actions": actions})
+
+    @staticmethod
+    def create_index_name(collection_id: str) -> str:
+        """Create a physical index name from the collection ID and a uuid4.
+
+        Args:
+            collection_id (str): Collection identifier.
+
+        Returns:
+            str: Formatted index name.
+        """
+        cleaned = collection_id.translate(_ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE)
+        return f"{ITEMS_INDEX_PREFIX}{cleaned.lower()}_{uuid.uuid4()}"
+
+    @staticmethod
+    def create_alias_name(collection_id: str, start_date: str) -> str:
+        """Create an alias name from the collection ID and a start date.
+
+        Args:
+            collection_id (str): Collection identifier.
+            start_date (str): Start date for the alias.
+
+        Returns:
+            str: Alias name with initial date.
+        """
+        cleaned = collection_id.translate(_ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE)
+        return f"{ITEMS_INDEX_PREFIX}{cleaned.lower()}_{start_date}"
+
+    @staticmethod
+    def _create_index_body(aliases: Dict[str, Dict]) -> Dict[str, Any]:
+        """Create index body with common settings.
+
+        Args:
+            aliases (Dict[str, Dict]): Aliases configuration.
+
+        Returns:
+            Dict[str, Any]: Index body configuration.
+        """
+        return {
+            "aliases": aliases,
+            "mappings": ES_ITEMS_MAPPINGS,
+            "settings": ES_ITEMS_SETTINGS,
+        }
+
+    @staticmethod
+    async def find_latest_item_in_index(client: Any, index_name: str) -> Dict[str, Any]:
+        """Find the most recent item in the specified index.
+
+        Args:
+            client: Search engine client instance.
+            index_name (str): Name of the index to query.
+
+        Returns:
+            Dict[str, Any]: Search hit for the item with the latest
+                properties.datetime value in the index.
+        """
+        query = {
+            "size": 1,
+            "sort": [{"properties.datetime": {"order": "desc"}}],
+            "_source": ["properties.datetime"],
+        }
+
+        response = await client.search(index=index_name, body=query)
+        return response["hits"]["hits"][0]
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py
new file mode 100644
index 00000000..06e9c729
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/inserters.py
@@ -0,0 +1,309 @@
+"""Async index insertion strategies."""
+import logging
+from datetime import timedelta
+from typing import Any, Dict, List
+
+from fastapi import HTTPException, status
+
+from stac_fastapi.sfeos_helpers.database import (
+    extract_date,
+    extract_first_date_from_index,
+    index_alias_by_collection_id,
+    mk_item_id,
+)
+
+from .base import BaseIndexInserter
+from .index_operations import IndexOperations
+from .managers import DatetimeIndexManager
+from .selection import DatetimeBasedIndexSelector
+
+logger = logging.getLogger(__name__)
+
+
+class DatetimeIndexInserter(BaseIndexInserter):
+    """Async datetime-based index insertion strategy."""
+
+    def __init__(self, client: Any, index_operations: IndexOperations):
+        """Initialize the async datetime index inserter.
+
+        Args:
+            client: Async search engine client instance.
+            index_operations (IndexOperations): Index operations instance.
+        """
+        self.client = client
+        self.index_operations = index_operations
+        self.datetime_manager = DatetimeIndexManager(client, index_operations)
+
+    @staticmethod
+    def should_create_collection_index() -> bool:
+        """Whether this strategy requires collection index creation.
+
+        Returns:
+            bool: False, as datetime strategy doesn't create collection indexes.
+        """
+        return False
+
+    async def create_simple_index(self, client: Any, collection_id: str) -> str:
+        """Create a simple index asynchronously.
+
+        Args:
+            client: Search engine client instance.
+            collection_id (str): Collection identifier.
+
+        Returns:
+            str: Created index name.
+        """
+        return await self.index_operations.create_simple_index(client, collection_id)
+
+    async def get_target_index(
+        self, collection_id: str, product: Dict[str, Any]
+    ) -> str:
+        """Get target index for a single product.
+
+        Args:
+            collection_id (str): Collection identifier.
+            product (Dict[str, Any]): Product data containing datetime information.
+
+        Returns:
+            str: Target index name for the product.
+        """
+        index_selector = DatetimeBasedIndexSelector(self.client)
+        return await self._get_target_index_internal(
+            index_selector, collection_id, product, check_size=True
+        )
+
+    async def prepare_bulk_actions(
+        self, collection_id: str, items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """Prepare bulk actions for multiple items.
+
+        Args:
+            collection_id (str): Collection identifier.
+            items (List[Dict[str, Any]]): List of items to process.
+
+        Returns:
+            List[Dict[str, Any]]: List of bulk actions ready for execution.
+        """
+        if not items:
+            msg = "The product list cannot be empty."
+            logger.error(msg)
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=msg)
+
+        items.sort(key=lambda item: item["properties"]["datetime"])
+        index_selector = DatetimeBasedIndexSelector(self.client)
+
+        await self._ensure_indexes_exist(index_selector, collection_id, items)
+        await self._check_and_handle_oversized_index(
+            index_selector, collection_id, items
+        )
+
+        actions = []
+        for item in items:
+            target_index = await self._get_target_index_internal(
+                index_selector, collection_id, item, check_size=False
+            )
+            actions.append(
+                {
+                    "_index": target_index,
+                    "_id": mk_item_id(item["id"], item["collection"]),
+                    "_source": item,
+                }
+            )
+
+        return actions
+
+    async def _get_target_index_internal(
+        self,
+        index_selector,
+        collection_id: str,
+        product: Dict[str, Any],
+        check_size: bool = True,
+    ) -> str:
+        """Get target index with size checking internally.
+
+        Args:
+            index_selector: Index selector instance.
+            collection_id (str): Collection identifier.
+            product (Dict[str, Any]): Product data.
+            check_size (bool): Whether to check index size limits.
+
+        Returns:
+            str: Target index name.
+        """
+        product_datetime = self.datetime_manager.validate_product_datetime(product)
+        datetime_range = {"gte": product_datetime, "lte": product_datetime}
+        target_index = await index_selector.select_indexes(
+            [collection_id], datetime_range
+        )
+        all_indexes = await index_selector.get_collection_indexes(collection_id)
+
+        if not all_indexes:
+            target_index = await self.datetime_manager.handle_new_collection(
+                collection_id, product_datetime
+            )
+            await index_selector.refresh_cache()
+            return target_index
+
+        all_indexes.sort()
+        start_date = extract_date(product_datetime)
+        end_date = extract_first_date_from_index(all_indexes[0])
+
+        if start_date < end_date:
+            alias = await self.datetime_manager.handle_early_date(
+                collection_id, start_date, end_date
+            )
+            await index_selector.refresh_cache()
+
+            return alias
+
+        if target_index != all_indexes[-1]:
+            return target_index
+
+        if check_size and await self.datetime_manager.size_manager.is_index_oversized(
+            target_index
+        ):
+            target_index = await self.datetime_manager.handle_oversized_index(
+                collection_id, target_index, product_datetime
+            )
+            await index_selector.refresh_cache()
+
+        return target_index
+
+    async def _ensure_indexes_exist(
+        self, index_selector, collection_id: str, items: List[Dict[str, Any]]
+    ):
+        """Ensure necessary indexes exist for the items.
+
+        Args:
+            index_selector: Index selector instance.
+            collection_id (str): Collection identifier.
+            items (List[Dict[str, Any]]): List of items to process.
+        """
+        all_indexes = await index_selector.get_collection_indexes(collection_id)
+
+        if not all_indexes:
+            first_item = items[0]
+            await self.index_operations.create_datetime_index(
+                self.client,
+                collection_id,
+                extract_date(first_item["properties"]["datetime"]),
+            )
+            await index_selector.refresh_cache()
+
+    async def _check_and_handle_oversized_index(
+        self, index_selector, collection_id: str, items: List[Dict[str, Any]]
+    ) -> None:
+        """Check if index is oversized and create new index if needed.
+
+        Checks if the index where the first item would be inserted is oversized.
+        If so, creates a new index starting from the next day.
+
+        Args:
+            index_selector: Index selector instance.
+            collection_id (str): Collection identifier.
+            items (List[Dict[str, Any]]): List of items to process.
+
+        Returns:
+            None
+        """
+        first_item = items[0]
+        first_item_index = await self._get_target_index_internal(
+            index_selector, collection_id, first_item, check_size=False
+        )
+
+        all_indexes = await index_selector.get_collection_indexes(collection_id)
+        all_indexes.sort()
+        latest_index = all_indexes[-1]
+
+        if first_item_index != latest_index:
+            return None
+
+        if not await self.datetime_manager.size_manager.is_index_oversized(
+            first_item_index
+        ):
+            return None
+
+        latest_item = await self.index_operations.find_latest_item_in_index(
+            self.client, latest_index
+        )
+        product_datetime = latest_item["_source"]["properties"]["datetime"]
+        end_date = extract_date(product_datetime)
+        await self.index_operations.update_index_alias(
+            self.client, str(end_date), latest_index
+        )
+        next_day_start = end_date + timedelta(days=1)
+        await self.index_operations.create_datetime_index(
+            self.client, collection_id, str(next_day_start)
+        )
+        await index_selector.refresh_cache()
+
+
+class SimpleIndexInserter(BaseIndexInserter):
+    """Simple async index insertion strategy."""
+
+    def __init__(self, index_operations: IndexOperations, client: Any):
+        """Initialize the async simple index inserter.
+
+        Args:
+            index_operations (IndexOperations): Index operations instance.
+            client: Async search engine client instance.
+        """
+        self.index_operations = index_operations
+        self.client = client
+
+    @staticmethod
+    def should_create_collection_index() -> bool:
+        """Whether this strategy requires collection index creation.
+
+        Returns:
+            bool: True, as simple strategy creates collection indexes.
+        """
+        return True
+
+    async def create_simple_index(self, client: Any, collection_id: str) -> str:
+        """Create a simple index asynchronously.
+
+        Args:
+            client: Search engine client instance.
+            collection_id (str): Collection identifier.
+
+        Returns:
+            str: Created index name.
+        """
+        return await self.index_operations.create_simple_index(client, collection_id)
+
+    async def get_target_index(
+        self, collection_id: str, product: Dict[str, Any]
+    ) -> str:
+        """Get target index (always the collection alias).
+
+        Args:
+            collection_id (str): Collection identifier.
+            product (Dict[str, Any]): Product data (not used in simple strategy).
+
+        Returns:
+            str: Collection alias name.
+        """
+        return index_alias_by_collection_id(collection_id)
+
+    async def prepare_bulk_actions(
+        self, collection_id: str, items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """Prepare bulk actions for simple indexing.
+
+        Args:
+            collection_id (str): Collection identifier.
+            items (List[Dict[str, Any]]): List of items to process.
+
+        Returns:
+            List[Dict[str, Any]]: List of bulk actions with collection alias as target.
+        """
+        target_index = index_alias_by_collection_id(collection_id)
+        return [
+            {
+                "_index": target_index,
+                "_id": mk_item_id(item["id"], item["collection"]),
+                "_source": item,
+            }
+            for item in items
+        ]
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py
new file mode 100644
index 00000000..1194e634
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/managers.py
@@ -0,0 +1,198 @@
+"""Index management utilities."""
+
+import logging
+import os
+from datetime import datetime, timedelta
+from typing import Any, Dict
+
+from fastapi import HTTPException, status
+
+from stac_fastapi.sfeos_helpers.database import (
+    extract_date,
+    extract_first_date_from_index,
+)
+
+from .index_operations import IndexOperations
+
+logger = logging.getLogger(__name__)
+
+
+class IndexSizeManager:
+    """Manages index size limits and operations."""
+
+    def __init__(self, client: Any):
+        """Initialize the index size manager.
+
+        Args:
+            client: Search engine client instance.
+        """
+        self.client = client
+        self.max_size_gb = self._get_max_size_from_env()
+
+    async def get_index_size_in_gb(self, index_name: str) -> float:
+        """Get index size in gigabytes asynchronously.
+
+        Args:
+            index_name (str): Name of the index to check.
+
+        Returns:
+            float: Size of the index in gigabytes.
+        """
+        data = await self.client.indices.stats(index=index_name)
+        return data["_all"]["primaries"]["store"]["size_in_bytes"] / 1e9
+
+    async def is_index_oversized(self, index_name: str) -> bool:
+        """Check if index exceeds size limit asynchronously.
+
+        Args:
+            index_name (str): Name of the index to check.
+
+        Returns:
+            bool: True if index exceeds size limit, False otherwise.
+        """
+        size_gb = await self.get_index_size_in_gb(index_name)
+        is_oversized = size_gb > self.max_size_gb
+        gb_milestone = int(size_gb)
+        if gb_milestone > 0:
+            logger.info(f"Index '{index_name}' size: {gb_milestone}GB")
+
+        if is_oversized:
+            logger.warning(
+                f"Index '{index_name}' is oversized: {size_gb:.2f} GB "
+                f"(limit: {self.max_size_gb} GB)"
+            )
+
+        return is_oversized
+
+    @staticmethod
+    def _get_max_size_from_env() -> float:
+        """Get max size from environment variable with error handling.
+
+        Returns:
+            float: Maximum index size in GB. Falls back to the default of
+                25.0 GB (with a warning) if the environment variable does
+                not contain a positive number.
+        """
+        env_value = os.getenv("DATETIME_INDEX_MAX_SIZE_GB", "25")
+
+        try:
+            max_size = float(env_value)
+            if max_size <= 0:
+                raise ValueError(
+                    f"DATETIME_INDEX_MAX_SIZE_GB must be positive, got: {max_size}"
+                )
+            return max_size
+        except (ValueError, TypeError):
+            error_msg = (
+                f"Invalid value for DATETIME_INDEX_MAX_SIZE_GB environment variable: "
+                f"'{env_value}'. Must be a positive number. Using default value 25.0 GB."
+            )
+            logger.warning(error_msg)
+
+        return 25.0
+
+
+class DatetimeIndexManager:
+    """Manages datetime-based index operations."""
+
+    def __init__(self, client: Any, index_operations: IndexOperations):
+        """Initialize the datetime index manager.
+
+        Args:
+            client: Search engine client instance.
+            index_operations (IndexOperations): Index operations instance.
+        """
+        self.client = client
+        self.index_operations = index_operations
+        self.size_manager = IndexSizeManager(client)
+
+    @staticmethod
+    def validate_product_datetime(product: Dict[str, Any]) -> str:
+        """Validate and extract datetime from product.
+
+        Args:
+            product (Dict[str, Any]): Product data containing datetime information.
+
+        Returns:
+            str: Validated product datetime.
+
+        Raises:
+            HTTPException: If product datetime is missing or invalid.
+        """
+        product_datetime = product["properties"]["datetime"]
+        if not product_datetime:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Product datetime is required for indexing",
+            )
+        return product_datetime
+
+    async def handle_new_collection(
+        self, collection_id: str, product_datetime: str
+    ) -> str:
+        """Handle index creation for new collection asynchronously.
+
+        Args:
+            collection_id (str): Collection identifier.
+            product_datetime (str): Product datetime for index naming.
+
+        Returns:
+            str: Created index name.
+        """
+        target_index = await self.index_operations.create_datetime_index(
+            self.client, collection_id, extract_date(product_datetime)
+        )
+        logger.info(
+            f"Successfully created index '{target_index}' for collection '{collection_id}'"
+        )
+        return target_index
+
+    async def handle_early_date(
+        self, collection_id: str, start_date: datetime, end_date: datetime
+    ) -> str:
+        """Handle product with date earlier than existing indexes asynchronously.
+
+        Args:
+            collection_id (str): Collection identifier.
+            start_date (datetime): Start date for the new index.
+            end_date (datetime): End date for alias update.
+
+        Returns:
+            str: Updated alias name.
+        """
+        old_alias = self.index_operations.create_alias_name(
+            collection_id, str(end_date)
+        )
+        new_alias = self.index_operations.create_alias_name(
+            collection_id, str(start_date)
+        )
+        await self.index_operations.change_alias_name(self.client, old_alias, new_alias)
+        return new_alias
+
+    async def handle_oversized_index(
+        self, collection_id: str, target_index: str, product_datetime: str
+    ) -> str:
+        """Handle index that exceeds size limit asynchronously.
+
+        Args:
+            collection_id (str): Collection identifier.
+            target_index (str): Current target index name.
+            product_datetime (str): Product datetime for new index.
+
+        Returns:
+            str: New or updated index name.
+ """ + end_date = extract_date(product_datetime) + latest_index_start = extract_first_date_from_index(target_index) + + if end_date != latest_index_start: + await self.index_operations.update_index_alias( + self.client, str(end_date), target_index + ) + target_index = await self.index_operations.create_datetime_index( + self.client, collection_id, str(end_date + timedelta(days=1)) + ) + + return target_index diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py new file mode 100644 index 00000000..cf68159c --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/__init__.py @@ -0,0 +1,15 @@ +"""Index selection strategies package.""" + +from .base import BaseIndexSelector +from .cache_manager import IndexAliasLoader, IndexCacheManager +from .factory import IndexSelectorFactory +from .selectors import DatetimeBasedIndexSelector, UnfilteredIndexSelector + +__all__ = [ + "IndexCacheManager", + "IndexAliasLoader", + "DatetimeBasedIndexSelector", + "UnfilteredIndexSelector", + "IndexSelectorFactory", + "BaseIndexSelector", +] diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py new file mode 100644 index 00000000..95f40672 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/base.py @@ -0,0 +1,30 @@ +"""Base classes for index selection strategies.""" + +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + + +class BaseIndexSelector(ABC): + """Base class for async index selectors.""" + + @abstractmethod + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select appropriate indexes asynchronously. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + datetime_search (Dict[str, Optional[str]]): Datetime search criteria. + + Returns: + str: Comma-separated string of selected index names. + """ + pass + + @abstractmethod + async def refresh_cache(self): + """Refresh cache (no-op for unfiltered selector).""" + pass diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py new file mode 100644 index 00000000..3b65244d --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/cache_manager.py @@ -0,0 +1,127 @@ +"""Cache management for index selection strategies.""" + +import threading +import time +from collections import defaultdict +from typing import Any, Dict, List, Optional + +from stac_fastapi.sfeos_helpers.database import index_alias_by_collection_id +from stac_fastapi.sfeos_helpers.mappings import ITEMS_INDEX_PREFIX + + +class IndexCacheManager: + """Manages caching of index aliases with expiration.""" + + def __init__(self, cache_ttl_seconds: int = 3600): + """Initialize the cache manager. + + Args: + cache_ttl_seconds (int): Time-to-live for cache entries in seconds. + """ + self._cache: Optional[Dict[str, List[str]]] = None + self._timestamp: float = 0 + self._ttl = cache_ttl_seconds + self._lock = threading.Lock() + + @property + def is_expired(self) -> bool: + """Check if the cache has expired. 
+
+        Returns:
+            bool: True if cache is expired, False otherwise.
+        """
+        return time.time() - self._timestamp > self._ttl
+
+    def get_cache(self) -> Optional[Dict[str, List[str]]]:
+        """Get the current cache if not expired.
+
+        Returns:
+            Optional[Dict[str, List[str]]]: Cache data if valid, None if expired.
+        """
+        with self._lock:
+            if self.is_expired:
+                return None
+            return {k: v.copy() for k, v in self._cache.items()}
+
+    def set_cache(self, data: Dict[str, List[str]]) -> None:
+        """Set cache data and update timestamp.
+
+        Args:
+            data (Dict[str, List[str]]): Cache data to store.
+        """
+        # Take the same lock as get_cache so readers never see a half-updated cache.
+        with self._lock:
+            self._cache = data
+            self._timestamp = time.time()
+
+    def clear_cache(self) -> None:
+        """Clear the cache and reset timestamp."""
+        with self._lock:
+            self._cache = None
+            self._timestamp = 0
+
+
+class IndexAliasLoader:
+    """Asynchronous loader for index aliases."""
+
+    def __init__(self, client: Any, cache_manager: IndexCacheManager):
+        """Initialize the async alias loader.
+
+        Args:
+            client: Async search engine client instance.
+            cache_manager (IndexCacheManager): Cache manager instance.
+        """
+        self.client = client
+        self.cache_manager = cache_manager
+
+    async def load_aliases(self) -> Dict[str, List[str]]:
+        """Load index aliases from search engine.
+
+        Returns:
+            Dict[str, List[str]]: Mapping of base aliases to item aliases.
+        """
+        response = await self.client.indices.get_alias(index=f"{ITEMS_INDEX_PREFIX}*")
+        result = defaultdict(list)
+        for index_info in response.values():
+            aliases = index_info.get("aliases", {})
+            items_aliases = sorted(
+                [
+                    alias
+                    for alias in aliases.keys()
+                    if alias.startswith(ITEMS_INDEX_PREFIX)
+                ]
+            )
+
+            if items_aliases:
+                result[items_aliases[0]].extend(items_aliases[1:])
+
+        self.cache_manager.set_cache(result)
+        return result
+
+    async def get_aliases(self) -> Dict[str, List[str]]:
+        """Get aliases from cache or load if expired.
+
+        Returns:
+            Dict[str, List[str]]: Alias mapping data.
+        """
+        cached = self.cache_manager.get_cache()
+        if cached is not None:
+            return cached
+        return await self.load_aliases()
+
+    async def refresh_aliases(self) -> Dict[str, List[str]]:
+        """Force refresh aliases from search engine.
+
+        Returns:
+            Dict[str, List[str]]: Fresh alias mapping data.
+        """
+        return await self.load_aliases()
+
+    async def get_collection_indexes(self, collection_id: str) -> List[str]:
+        """Get all index aliases for a specific collection.
+
+        Args:
+            collection_id (str): Collection identifier.
+
+        Returns:
+            List[str]: List of index aliases for the collection.
+        """
+        aliases = await self.get_aliases()
+        return aliases.get(index_alias_by_collection_id(collection_id), [])
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py
new file mode 100644
index 00000000..4ada945b
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/factory.py
@@ -0,0 +1,37 @@
+"""Factory for creating index selection strategies."""
+
+from typing import Any
+
+from stac_fastapi.core.utilities import get_bool_env
+
+from .base import BaseIndexSelector
+from .selectors import DatetimeBasedIndexSelector, UnfilteredIndexSelector
+
+
+class IndexSelectorFactory:
+    """Factory class for creating index selector instances."""
+
+    @staticmethod
+    def create_selector(client: Any) -> BaseIndexSelector:
+        """Create an appropriate asynchronous index selector based on environment configuration.
+
+        Checks the ENABLE_DATETIME_INDEX_FILTERING environment variable to determine
+        whether to use datetime-based filtering or return all available indices.
+
+        Args:
+            client: Asynchronous Elasticsearch/OpenSearch client instance, used only if datetime
+                filtering is enabled.
+
+        Returns:
+            BaseIndexSelector: A DatetimeBasedIndexSelector if datetime
+                filtering is enabled, or an UnfilteredIndexSelector otherwise.
+        """
+        use_datetime_filtering = get_bool_env(
+            "ENABLE_DATETIME_INDEX_FILTERING", default="false"
+        )
+
+        return (
+            DatetimeBasedIndexSelector(client)
+            if use_datetime_filtering
+            else UnfilteredIndexSelector()
+        )
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py
new file mode 100644
index 00000000..20f919ab
--- /dev/null
+++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/search_engine/selection/selectors.py
@@ -0,0 +1,129 @@
+"""Async index selectors with datetime-based filtering."""
+
+from typing import Any, Dict, List, Optional
+
+from stac_fastapi.sfeos_helpers.database import filter_indexes_by_datetime
+from stac_fastapi.sfeos_helpers.mappings import ITEM_INDICES
+
+from ...database import indices
+from .base import BaseIndexSelector
+from .cache_manager import IndexAliasLoader, IndexCacheManager
+
+
+class DatetimeBasedIndexSelector(BaseIndexSelector):
+    """Asynchronous index selector that filters indices based on datetime criteria with caching."""
+
+    _instance = None
+
+    def __new__(cls, client):
+        """Create singleton instance.
+
+        Args:
+            client: Async search engine client instance.
+
+        Returns:
+            DatetimeBasedIndexSelector: Singleton instance.
+        """
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, client: Any):
+        """Initialize the datetime-based index selector.
+
+        Args:
+            client: Elasticsearch/OpenSearch client instance used for querying
+                index aliases and metadata.
+        """
+        if not hasattr(self, "_initialized"):
+            self.cache_manager = IndexCacheManager()
+            self.alias_loader = IndexAliasLoader(client, self.cache_manager)
+            self._initialized = True
+
+    async def refresh_cache(self) -> Dict[str, List[str]]:
+        """Force refresh of the aliases cache.
+
+        Returns:
+            Dict[str, List[str]]: Refreshed dictionary mapping base collection aliases
+                to lists of their corresponding item index aliases.
+        """
+        return await self.alias_loader.refresh_aliases()
+
+    async def get_collection_indexes(self, collection_id: str) -> List[str]:
+        """Get all index aliases for a specific collection.
+
+        Args:
+            collection_id (str): The ID of the collection to retrieve indexes for.
+
+        Returns:
+            List[str]: List of index aliases associated with the collection.
+                Returns empty list if collection is not found in cache.
+        """
+        return await self.alias_loader.get_collection_indexes(collection_id)
+
+    async def select_indexes(
+        self,
+        collection_ids: Optional[List[str]],
+        datetime_search: Dict[str, Optional[str]],
+    ) -> str:
+        """Select indexes filtered by collection IDs and datetime criteria.
+
+        For each specified collection, retrieves its associated indexes and filters
+        them based on datetime range. If no collection IDs are provided, returns
+        all item indices.
+
+        Args:
+            collection_ids (Optional[List[str]]): List of collection IDs to filter by.
+                If None or empty, returns all item indices.
+ datetime_search (Dict[str, Optional[str]]): Dictionary containing datetime + search criteria with 'gte' and 'lte' keys for range filtering. + + Returns: + str: Comma-separated string of selected index names that match the + collection and datetime criteria. Returns empty string if no + indexes match the criteria. + """ + if collection_ids: + selected_indexes = [] + for collection_id in collection_ids: + collection_indexes = await self.get_collection_indexes(collection_id) + filtered_indexes = filter_indexes_by_datetime( + collection_indexes, + datetime_search.get("gte"), + datetime_search.get("lte"), + ) + selected_indexes.extend(filtered_indexes) + + return ",".join(selected_indexes) if selected_indexes else "" + + return ITEM_INDICES + + +class UnfilteredIndexSelector(BaseIndexSelector): + """Index selector that returns all available indices without filtering.""" + + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select all indices for given collections without datetime filtering. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + If None, all collections are considered. + datetime_search (Dict[str, Optional[str]]): Datetime search criteria + (ignored by this implementation). + + Returns: + str: Comma-separated string of all available index names for the collections. + """ + return indices(collection_ids) + + async def refresh_cache(self): + """Refresh cache (no-op for unfiltered selector). + + Note: + Unfiltered selector doesn't use cache, so this is a no-op operation. + """ + pass diff --git a/stac_fastapi/tests/api/test_api.py b/stac_fastapi/tests/api/test_api.py index efc97174..a9de4460 100644 --- a/stac_fastapi/tests/api/test_api.py +++ b/stac_fastapi/tests/api/test_api.py @@ -1,7 +1,9 @@ +import os import random import uuid from copy import deepcopy from datetime import datetime, timedelta +from unittest.mock import patch import pytest @@ -25,6 +27,7 @@ "GET /collections/{collection_id}", "GET /collections/{collection_id}/queryables", "GET /collections/{collection_id}/items", + "POST /collections/{collection_id}/bulk_items", "GET /collections/{collection_id}/items/{item_id}", "GET /search", "POST /search", @@ -427,6 +430,9 @@ async def test_search_point_does_not_intersect(app_client, ctx): @pytest.mark.asyncio async def test_datetime_response_format(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -464,6 +470,9 @@ async def test_datetime_response_format(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_non_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -500,6 +509,9 @@ async def test_datetime_non_interval(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -536,6 +548,9 @@ async def test_datetime_interval(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_bad_non_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -572,6 +587,9 @@ async def 
test_datetime_bad_non_interval(app_client, txn_client, ctx): @pytest.mark.asyncio async def test_datetime_bad_interval(app_client, txn_client, ctx): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + first_item = dict(ctx.item) second_item = deepcopy(first_item) @@ -823,3 +841,632 @@ async def test_big_int_eo_search( results = {x["properties"][attr] for x in resp_json["features"]} assert len(results) == expected assert results == {value} + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_in_past_date_change_alias_name_for_datetime_index( + app_client, ctx, load_test_data, txn_client +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2012-02-12T12:30:22Z" + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + assert response.status_code == 201 + indices = await txn_client.database.client.indices.get_alias( + index="items_test-collection" + ) + expected_aliases = [ + "items_test-collection_2012-02-12", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_uses_existing_datetime_index_for_datetime_index( + app_client, ctx, load_test_data, txn_client +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias( + index="items_test-collection" + ) + expected_aliases = [ + "items_test-collection_2020-02-12", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_item_with_different_date_same_index_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2022-02-12T12:30:22Z" + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias( + index="items_test-collection" + ) + expected_aliases = [ + "items_test-collection_2020-02-12", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_create_new_index_when_size_limit_exceeded_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2024-02-12T12:30:22Z" + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as 
mock_get_size:
+        mock_get_size.return_value = 26.0
+        response = await app_client.post(
+            f"/collections/{item['collection']}/items", json=item
+        )
+
+    assert response.status_code == 201
+
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+    expected_aliases = [
+        "items_test-collection_2020-02-12-2024-02-12",
+        "items_test-collection_2024-02-13",
+    ]
+    all_aliases = set()
+
+    for index_info in indices.values():
+        all_aliases.update(index_info.get("aliases", {}).keys())
+    assert all(alias in all_aliases for alias in expected_aliases)
+
+    item_2 = deepcopy(item)
+    item_2["id"] = str(uuid.uuid4())
+    item_2["properties"]["datetime"] = "2023-02-12T12:30:22Z"
+    response_2 = await app_client.post(
+        f"/collections/{item_2['collection']}/items", json=item_2
+    )
+    assert response_2.status_code == 201
+
+
+@pytest.mark.datetime_filtering
+@pytest.mark.asyncio
+async def test_create_item_fails_without_datetime_for_datetime_index(
+    app_client, load_test_data, txn_client, ctx
+):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
+    item = load_test_data("test_item.json")
+    item["id"] = str(uuid.uuid4())
+    item["properties"]["datetime"] = None
+    response = await app_client.post(
+        f"/collections/{item['collection']}/items", json=item
+    )
+    assert response.status_code == 400
+
+
+@pytest.mark.datetime_filtering
+@pytest.mark.asyncio
+async def test_bulk_create_items_with_same_date_range_for_datetime_index(
+    app_client, load_test_data, txn_client, ctx
+):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
+    base_item = load_test_data("test_item.json")
+    items_dict = {}
+
+    for i in range(10):
+        item = deepcopy(base_item)
+        item["id"] = str(uuid.uuid4())
+        item["properties"]["datetime"] = f"2020-02-{12 + i}T12:30:22Z"
+        items_dict[item["id"]] = item
+
+    payload = {"type": "FeatureCollection", "features": list(items_dict.values())}
+    response = await app_client.post(
+        f"/collections/{base_item['collection']}/items", json=payload
+    )
+
+    assert response.status_code == 201
+
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+    expected_aliases = [
+        "items_test-collection_2020-02-12",
+    ]
+    all_aliases = set()
+    for index_info in indices.values():
+        all_aliases.update(index_info.get("aliases", {}).keys())
+    assert all(alias in all_aliases for alias in expected_aliases)
+
+
+@pytest.mark.datetime_filtering
+@pytest.mark.asyncio
+async def test_bulk_create_items_with_different_date_ranges_for_datetime_index(
+    app_client, load_test_data, txn_client, ctx
+):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
+    base_item = load_test_data("test_item.json")
+    items_dict = {}
+
+    for i in range(3):
+        item = deepcopy(base_item)
+        item["id"] = str(uuid.uuid4())
+        item["properties"]["datetime"] = f"2020-02-{12 + i}T12:30:22Z"
+        items_dict[item["id"]] = item
+
+    for i in range(2):
+        item = deepcopy(base_item)
+        item["id"] = str(uuid.uuid4())
+        item["properties"]["datetime"] = f"2010-02-{10 + i}T12:30:22Z"
+        items_dict[item["id"]] = item
+
+    payload = {"type": "FeatureCollection", "features": list(items_dict.values())}
+
+    response = await app_client.post(
+        f"/collections/{base_item['collection']}/items", json=payload
+    )
+
+    assert response.status_code == 201
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+
+    expected_aliases = ["items_test-collection_2010-02-10"]
+    all_aliases = set()
+    for index_info in indices.values():
+        all_aliases.update(index_info.get("aliases", 
{}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_bulk_create_items_with_size_limit_exceeded_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2019-02", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_aliases = [ + "items_test-collection_2010-02-10-2020-02-12", + "items_test-collection_2020-02-13", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_bulk_create_items_with_early_date_in_second_batch_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_aliases = [ + "items_test-collection_2008-01-15-2020-02-12", + 
"items_test-collection_2020-02-13", + ] + all_aliases = set() + for index_info in indices.values(): + all_aliases.update(index_info.get("aliases", {}).keys()) + assert all(alias in all_aliases for alias in expected_aliases) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_bulk_create_items_and_retrieve_by_id_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + response = await app_client.get( + f"/collections/{collection_id}/items/{base_item['id']}" + ) + assert response.status_code == 200 + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_patch_collection_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + patch_data = { + "description": "Updated description via PATCH", + } + response = await app_client.patch( + f"/collections/{collection_id}?refresh=true", json=patch_data + ) + assert response.status_code == 200 + assert response.json()["description"] == "Updated description via PATCH" + 
+ +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_put_collection_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + collection_response = await app_client.get(f"/collections/{collection_id}") + assert collection_response.status_code == 200 + collection_data = collection_response.json() + + collection_data["description"] = "Updated description via PUT" + collection_data["title"] = "Updated title via PUT" + response = await app_client.put( + f"/collections/{collection_id}?refresh=true", json=collection_data + ) + assert response.json()["description"] == "Updated description via PUT" + assert response.json()["title"] == "Updated title via PUT" + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_patch_item_for_datetime_index( + app_client, load_test_data, txn_client, ctx +): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + patch_data = {"properties": {"description": "Updated description via PATCH"}} + + response = await app_client.patch( + 
f"/collections/{collection_id}/items/{base_item['id']}", json=patch_data + ) + assert response.status_code == 200 + assert ( + response.json()["properties"]["description"] + == "Updated description via PATCH" + ) + + +@pytest.mark.datetime_filtering +@pytest.mark.asyncio +async def test_put_item_for_datetime_index(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip("Datetime index filtering not enabled") + + base_item = load_test_data("test_item.json") + collection_id = base_item["collection"] + + def create_items(date_prefix: str, start_day: int, count: int) -> dict: + items = {} + for i in range(count): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"][ + "datetime" + ] = f"{date_prefix}-{start_day + i:02d}T12:30:22Z" + items[item["id"]] = item + return items + + with patch( + "stac_fastapi.sfeos_helpers.search_engine.managers.IndexSizeManager.get_index_size_in_gb" + ) as mock_get_size: + mock_get_size.side_effect = [10, 26] + + first_items = create_items("2010-02", start_day=10, count=2) + first_payload = { + "type": "FeatureCollection", + "features": list(first_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=first_payload + ) + assert response.status_code == 201 + + second_items = create_items("2008-01", start_day=15, count=3) + second_payload = { + "type": "FeatureCollection", + "features": list(second_items.values()), + } + response = await app_client.post( + f"/collections/{collection_id}/items", json=second_payload + ) + assert response.status_code == 201 + + item_response = await app_client.get( + f"/collections/{collection_id}/items/{base_item['id']}" + ) + assert item_response.status_code == 200 + item_data = item_response.json() + + item_data["properties"]["platform"] = "Updated platform via PUT" + response = await app_client.put( + f"/collections/{collection_id}/items/{base_item['id']}", json=item_data + ) + assert response.json()["properties"]["platform"] == "Updated platform via PUT" diff --git a/stac_fastapi/tests/conftest.py b/stac_fastapi/tests/conftest.py index d8c5fc88..23da2668 100644 --- a/stac_fastapi/tests/conftest.py +++ b/stac_fastapi/tests/conftest.py @@ -26,6 +26,7 @@ from stac_fastapi.core.rate_limit import setup_rate_limit from stac_fastapi.core.utilities import get_bool_env from stac_fastapi.sfeos_helpers.aggregation import EsAsyncBaseAggregationClient +from stac_fastapi.sfeos_helpers.mappings import ITEMS_INDEX_PREFIX if os.getenv("BACKEND", "elasticsearch").lower() == "opensearch": from stac_fastapi.opensearch.app import app_config @@ -158,6 +159,8 @@ async def delete_collections_and_items(txn_client: TransactionsClient) -> None: await refresh_indices(txn_client) await txn_client.database.delete_items() await txn_client.database.delete_collections() + await txn_client.database.client.indices.delete(index=f"{ITEMS_INDEX_PREFIX}*") + await txn_client.database.async_index_selector.refresh_cache() async def refresh_indices(txn_client: TransactionsClient) -> None: diff --git a/stac_fastapi/tests/database/test_database.py b/stac_fastapi/tests/database/test_database.py index 86611235..67897c15 100644 --- a/stac_fastapi/tests/database/test_database.py +++ b/stac_fastapi/tests/database/test_database.py @@ -1,3 +1,4 @@ +import os import uuid import pytest @@ -27,6 +28,9 @@ async def test_index_mapping_collections(ctx): @pytest.mark.asyncio async def test_index_mapping_items(txn_client, load_test_data): + if 
os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + collection = load_test_data("test_collection.json") collection["id"] = str(uuid.uuid4()) await txn_client.create_collection( diff --git a/stac_fastapi/tests/resources/test_item.py b/stac_fastapi/tests/resources/test_item.py index 0102bf9b..0299cdc0 100644 --- a/stac_fastapi/tests/resources/test_item.py +++ b/stac_fastapi/tests/resources/test_item.py @@ -114,8 +114,15 @@ async def test_create_uppercase_collection_with_item( async def test_update_item_already_exists(app_client, ctx, load_test_data): """Test updating an item which already exists (transactions extension)""" item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) assert item["properties"]["gsd"] != 16 item["properties"]["gsd"] = 16 + + response = await app_client.post( + f"/collections/{item['collection']}/items", json=item + ) + assert response.status_code == 201 + await app_client.put( f"/collections/{item['collection']}/items/{item['id']}", json=item ) @@ -998,6 +1005,9 @@ async def _search_and_get_ids( async def test_search_datetime_with_null_datetime( app_client, txn_client, load_test_data ): + if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + """Test datetime filtering when properties.datetime is null or set, ensuring start_datetime and end_datetime are set when datetime is null.""" # Setup: Create test collection test_collection = load_test_data("test_collection.json") diff --git a/tox.ini b/tox.ini index 100ee64c..546c7767 100644 --- a/tox.ini +++ b/tox.ini @@ -13,4 +13,8 @@ max-line-length = 90 profile=black known_first_party = stac_fastapi known_third_party = rasterio,stac-pydantic,sqlalchemy,geoalchemy2,fastapi -sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER \ No newline at end of file +sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER + +[tool:pytest] +markers = + datetime_filtering: tests that require ENABLE_DATETIME_INDEX_FILTERING=true \ No newline at end of file From c9668ac78d311d9db318d78ef509e408be53297c Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Thu, 7 Aug 2025 17:36:21 +0200 Subject: [PATCH 02/11] fix --- .../opensearch/stac_fastapi/opensearch/database_logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index 83ec6821..aebfc5ad 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -46,7 +46,7 @@ merge_to_operations, operations_to_script, ) -from stac_fastapi.sfeos_helpers.filter import filter as filter_module +from stac_fastapi.sfeos_helpers import filter as filter_module from stac_fastapi.sfeos_helpers.mappings import ( AGGREGATION_MAPPING, COLLECTIONS_INDEX, From db4a4dc3d6e35c3042c342a3570e463809bb5068 Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Thu, 7 Aug 2025 17:40:44 +0200 Subject: [PATCH 03/11] isort --- .../opensearch/stac_fastapi/opensearch/database_logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index aebfc5ad..0fc26c3d 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -26,6 +26,7 @@ AsyncOpensearchSettings as AsyncSearchSettings, ) from 
stac_fastapi.opensearch.config import OpensearchSettings as SyncSearchSettings
+from stac_fastapi.sfeos_helpers import filter as filter_module
 from stac_fastapi.sfeos_helpers.database import (
     apply_free_text_filter_shared,
     apply_intersects_filter_shared,
@@ -46,7 +47,6 @@
     merge_to_operations,
     operations_to_script,
 )
-from stac_fastapi.sfeos_helpers import filter as filter_module
 from stac_fastapi.sfeos_helpers.mappings import (
     AGGREGATION_MAPPING,
     COLLECTIONS_INDEX,

From 60d517f6012ea48a11a01fc2b2de0c110468db38 Mon Sep 17 00:00:00 2001
From: Grzegorz Pustulka
Date: Thu, 7 Aug 2025 17:55:51 +0200
Subject: [PATCH 04/11] fix in cicd.yml

---
 .github/workflows/cicd.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
index ae34d115..25865744 100644
--- a/.github/workflows/cicd.yml
+++ b/.github/workflows/cicd.yml
@@ -123,5 +123,6 @@ jobs:
       ES_PORT: ${{ matrix.backend == 'elasticsearch7' && '9400' || matrix.backend == 'elasticsearch8' && '9200' || '9202' }}
       ES_HOST: 172.17.0.1
       ES_USE_SSL: false
+      DATABASE_REFRESH: true
       ES_VERIFY_CERTS: false
       BACKEND: ${{ matrix.backend == 'elasticsearch7' && 'elasticsearch' || matrix.backend == 'elasticsearch8' && 'elasticsearch' || 'opensearch' }}

From f2da9c522afd2543ec7d28c31901c2c3361b66cd Mon Sep 17 00:00:00 2001
From: Grzegorz Pustulka
Date: Thu, 7 Aug 2025 18:04:24 +0200
Subject: [PATCH 05/11] fix for opensearch

---
 .../opensearch/stac_fastapi/opensearch/database_logic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py
index 0fc26c3d..830694c3 100644
--- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py
+++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py
@@ -924,7 +924,7 @@ async def json_patch_item(
             await self.client.update(
                 index=document_index,
                 id=mk_item_id(item_id, collection_id),
-                script=script,
+                body={"script": script},
                 refresh=True,
             )
         except exceptions.NotFoundError:

From 6ba27fdebc9bb2931f435f52beb7c54dcbefc447 Mon Sep 17 00:00:00 2001
From: Grzegorz Pustulka
Date: Mon, 11 Aug 2025 12:37:51 +0200
Subject: [PATCH 06/11] changelog, readme

---
 CHANGELOG.md |  26 +-
 README.md    | 618 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 631 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f820245..52ce5f2f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,26 +11,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ### Added

 - Added comprehensive index management system with dynamic selection and insertion strategies for improved performance and scalability [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405)
-- Added `ENABLE_DATETIME_INDEX_FILTERING` environment variable to enable datetime-based index selection using collection IDs. Requires indexes in format: `STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day`, e.g. `items_sentinel-2-l2a_2025-06-06-2025-09-22`. Default is `false`. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405)
-- Added `DATETIME_INDEX_MAX_SIZE_GB` environment variable to set maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Default is `25` GB. Only applies when `ENABLE_DATETIME_INDEX_FILTERING` is enabled. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405)
-- Added search engine adapter system with support for both Elasticsearch and OpenSearch [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405):
-  - `SearchEngineAdapter` base class with engine-specific implementations
-  - `ElasticsearchAdapter` and `OpenSearchAdapter` with tailored index creation methods
-  - Automatic engine type detection based on client class
-  - `SearchEngineAdapterFactory` for creating appropriate adapters
+- Added `ENABLE_DATETIME_INDEX_FILTERING` environment variable to enable datetime-based index selection using collection IDs. When enabled, the system creates indexes with UUID-based names and manages them through time-based aliases. Default is `false`. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405)
+- Added `DATETIME_INDEX_MAX_SIZE_GB` environment variable to set the maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Note: allow roughly 20% above the target size to account for ES/OS compression. Default is `25` GB. Only applies when `ENABLE_DATETIME_INDEX_FILTERING` is enabled. [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405)
+- Added index operations system with unified interface for both Elasticsearch and OpenSearch [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405):
+  - `IndexOperations` class with common index creation and management methods
+  - UUID-based physical index naming: `{prefix}_{collection-id}_{uuid4}`
+  - Alias management: main collection alias, temporal aliases, and closed index aliases
+  - Automatic alias updates when indexes reach size limits
 - Added datetime-based index selection strategies with caching support [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405):
-  - `AsyncDatetimeBasedIndexSelector` and `SyncDatetimeBasedIndexSelector` for temporal filtering
+  - `DatetimeBasedIndexSelector` for temporal filtering with intelligent caching
   - `IndexCacheManager` with configurable TTL-based cache expiration (default 1 hour)
-  - `AsyncIndexAliasLoader` and `SyncIndexAliasLoader` for alias management
+  - `IndexAliasLoader` for alias management and cache refresh
   - `UnfilteredIndexSelector` as fallback for returning all available indexes
 - Added index insertion strategies with automatic partitioning [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405):
-  - Simple insertion strategy (`AsyncSimpleIndexInserter`, `SyncSimpleIndexInserter`) for traditional single-index-per-collection approach
-  - Datetime-based insertion strategy (`AsyncDatetimeIndexInserter`, `SyncDatetimeIndexInserter`) with time-based partitioning
+  - Simple insertion strategy (`SimpleIndexInserter`) for traditional single-index-per-collection approach
+  - Datetime-based insertion strategy (`DatetimeIndexInserter`) with time-based partitioning
   - Automatic index size monitoring and splitting when limits are exceeded
   - Handling of chronologically early data and bulk operations
 - Added index management utilities [#405](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/405):
-  - `IndexSizeManager` for size monitoring and overflow handling
-  - `DatetimeIndexManager` for datetime-based index operations
+  - `IndexSizeManager` for size monitoring and overflow handling with compression awareness
+  - `DatetimeIndexManager` for datetime-based 
index operations and validation - Factory patterns (`IndexInsertionFactory`, `IndexSelectorFactory`) for strategy creation based on configuration diff --git a/README.md b/README.md index 9ba41ed8..a0934b45 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,233 @@ There are two main ways to run the API locally: +## Configuration Reference + +You can customize additional settings in your `.env` file: + +| Variable | Description | Default | Required | +|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|---------------------------------------------------------------------------------------------| +| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | +| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS) | Optional | +| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `false` | Optional | +| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `false` | Optional | +| `ES_TIMEOUT` | Client timeout for Elasticsearch/OpenSearch. | DB client default | Optional | +| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | +| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | +| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional | +| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional | +| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional | +| `APP_PORT` | Server port. | `8080` | Optional | +| `ENVIRONMENT` | Runtime environment. | `local` | Optional | +| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional | +| `RELOAD` | Enable auto-reload for development. | `true` | Optional | +| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional | +| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional | +| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional | | +| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional +| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional +| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` Optional | +| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional | +| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. 
If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional | +| `ENABLE_DATETIME_INDEX_FILTERING` | Enable datetime-ba# stac-fastapi-elasticsearch-opensearch + + + + +

+ +

+ +**Jump to:** [Project Introduction](#project-introduction---what-is-sfeos) | [Quick Start](#quick-start) | [Table of Contents](#table-of-contents) + + [![Downloads](https://static.pepy.tech/badge/stac-fastapi-core?color=blue)](https://pepy.tech/project/stac-fastapi-core) + [![GitHub contributors](https://img.shields.io/github/contributors/stac-utils/stac-fastapi-elasticsearch-opensearch?color=blue)](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/graphs/contributors) + [![GitHub stars](https://img.shields.io/github/stars/stac-utils/stac-fastapi-elasticsearch-opensearch.svg?color=blue)](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/stargazers) + [![GitHub forks](https://img.shields.io/github/forks/stac-utils/stac-fastapi-elasticsearch-opensearch.svg?color=blue)](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/network/members) + [![PyPI version](https://img.shields.io/pypi/v/stac-fastapi-elasticsearch.svg?color=blue)](https://pypi.org/project/stac-fastapi-elasticsearch/) + [![STAC](https://img.shields.io/badge/STAC-1.1.0-blue.svg)](https://github.com/radiantearth/stac-spec/tree/v1.1.0) + [![stac-fastapi](https://img.shields.io/badge/stac--fastapi-6.0.0-blue.svg)](https://github.com/stac-utils/stac-fastapi) + +## Sponsors & Supporters + +The following organizations have contributed time and/or funding to support the development of this project: + +

+ Healy Hyperspatial + Atomic Maps + VITO Remote Sensing +

+ +## Project Introduction - What is SFEOS? + +SFEOS (stac-fastapi-elasticsearch-opensearch) is a high-performance, scalable API implementation for serving SpatioTemporal Asset Catalog (STAC) data - an enhanced GeoJSON format designed specifically for geospatial assets like satellite imagery, aerial photography, and other Earth observation data. This project enables organizations to: + +- **Efficiently catalog and search geospatial data** such as satellite imagery, aerial photography, DEMs, and other geospatial assets using Elasticsearch or OpenSearch as the database backend +- **Implement standardized STAC APIs** that support complex spatial, temporal, and property-based queries across large collections of geospatial data +- **Scale to millions of geospatial assets** with fast search performance through optimized spatial indexing and query capabilities +- **Support OGC-compliant filtering** including spatial operations (intersects, contains, etc.) and temporal queries +- **Perform geospatial aggregations** to analyze data distribution across space and time + +This implementation builds on the STAC-FastAPI framework, providing a production-ready solution specifically optimized for Elasticsearch and OpenSearch databases. It's ideal for organizations managing large geospatial data catalogs who need efficient discovery and access capabilities through standardized APIs. + + + +## Common Deployment Patterns + +stac-fastapi-elasticsearch-opensearch can be deployed in several ways depending on your needs: + +- **Containerized Application**: Run as a Docker container with connections to Elasticsearch/OpenSearch databases +- **Serverless Function**: Deploy as AWS Lambda or similar serverless function with API Gateway +- **Traditional Server**: Run on virtual machines or bare metal servers in your infrastructure +- **Kubernetes**: Deploy as part of a larger microservices architecture with container orchestration + +The implementation is flexible and can scale from small local deployments to large production environments serving millions of geospatial assets. + +## Technologies + +This project is built on the following technologies: STAC, stac-fastapi, FastAPI, Elasticsearch, Python, OpenSearch + +

+ STAC + Python + FastAPI + Elasticsearch + OpenSearch +

+ +## Table of Contents + +- [Documentation & Resources](#documentation--resources) +- [Package Structure](#package-structure) +- [Examples](#examples) +- [Performance](#performance) +- [Quick Start](#quick-start) + - [Installation](#installation) + - [Running Locally](#running-locally) +- [Configuration reference](#configuration-reference) +- [Interacting with the API](#interacting-with-the-api) +- [Configure the API](#configure-the-api) +- [Collection pagination](#collection-pagination) +- [Ingesting Sample Data CLI Tool](#ingesting-sample-data-cli-tool) +- [Elasticsearch Mappings](#elasticsearch-mappings) +- [Managing Elasticsearch Indices](#managing-elasticsearch-indices) + - [Snapshots](#snapshots) + - [Reindexing](#reindexing) +- [Auth](#auth) +- [Aggregation](#aggregation) +- [Rate Limiting](#rate-limiting) + +## Documentation & Resources + +- **Online Documentation**: [https://stac-utils.github.io/stac-fastapi-elasticsearch-opensearch](https://stac-utils.github.io/stac-fastapi-elasticsearch-opensearch/) +- **Source Code**: [https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch) +- **API Examples**: [Postman Documentation](https://documenter.getpostman.com/view/12888943/2s8ZDSdRHA) - Examples of how to use the API endpoints +- **Community**: + - [Gitter Chat](https://app.gitter.im/#/room/#stac-fastapi-elasticsearch_community:gitter.im) - For real-time discussions + - [GitHub Discussions](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/discussions) - For longer-form questions and answers + +## Package Structure + +This project is organized into several packages, each with a specific purpose: + +- **stac_fastapi_core**: Core functionality that's database-agnostic, including API models, extensions, and shared utilities. This package provides the foundation for building STAC API implementations with any database backend. See [stac-fastapi-mongo](https://github.com/Healy-Hyperspatial/stac-fastapi-mongo) for a working example. + +- **sfeos_helpers**: Shared helper functions and utilities used by both the Elasticsearch and OpenSearch backends. This package includes: + - `database`: Specialized modules for index, document, and database utility operations + - `aggregation`: Elasticsearch/OpenSearch-specific aggregation functionality + - Shared logic and utilities that improve code reuse between backends + +- **stac_fastapi_elasticsearch**: Complete implementation of the STAC API using Elasticsearch as the backend database. This package depends on both `stac_fastapi_core` and `sfeos_helpers`. +- +- **stac_fastapi_opensearch**: Complete implementation of the STAC API using OpenSearch as the backend database. This package depends on both `stac_fastapi_core` and `sfeos_helpers`. + +## Examples + +The `/examples` directory contains several useful examples and reference implementations: + +- **pip_docker**: Examples of running stac-fastapi-elasticsearch from PyPI in Docker without needing any code from the repository +- **auth**: Authentication examples including: + - Basic authentication + - OAuth2 with Keycloak + - Route dependencies configuration +- **rate_limit**: Example of implementing rate limiting for API requests +- **postman_collections**: Postman collection files you can import for testing API endpoints + +These examples provide practical reference implementations for various deployment scenarios and features. 
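To make the package split above concrete, the snippet below strings together import paths that appear verbatim in the `database_logic.py` diffs elsewhere in this patch series; only the final `print` is illustrative glue, not part of any commit:

```python
# Import paths copied from the diffs in this series; running this only
# requires the stac-fastapi packages to be installed.
from stac_fastapi.opensearch.config import (
    OpensearchSettings as SyncSearchSettings,  # backend-specific settings
)
from stac_fastapi.sfeos_helpers import filter as filter_module  # shared filter logic
from stac_fastapi.sfeos_helpers.database import (
    apply_free_text_filter_shared,  # helpers reused by both backends
    apply_intersects_filter_shared,
)

print(filter_module.__name__)  # smoke check that the shared module resolves
```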
+ +## Performance + +### Direct Response Mode + +- The `enable_direct_response` option is provided by the stac-fastapi core library (introduced in stac-fastapi 5.2.0) and is available in this project starting from v4.0.0. +- **Control via environment variable**: Set `ENABLE_DIRECT_RESPONSE=true` to enable this feature. +- **How it works**: When enabled, endpoints return Starlette Response objects directly, bypassing FastAPI's default serialization for improved performance. +- **Important limitation**: All FastAPI dependencies (including authentication, custom status codes, and validation) are disabled for all routes when this mode is enabled. +- **Best use case**: This mode is best suited for public or read-only APIs where authentication and custom logic are not required. +- **Default setting**: `false` for safety. +- **More information**: See [issue #347](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/issues/347) for background and implementation details. + +## Quick Start + +This section helps you get up and running with stac-fastapi-elasticsearch-opensearch quickly. + +### Installation + +- **For versions 4.0.0a1 and newer** (PEP 625 compliant naming): + ```bash + pip install stac-fastapi-elasticsearch # Elasticsearch backend + pip install stac-fastapi-opensearch # Opensearch backend + pip install stac-fastapi-core # Core library + ``` + +- **For versions 4.0.0a0 and older**: + ```bash + pip install stac-fastapi.elasticsearch # Elasticsearch backend + pip install stac-fastapi.opensearch # Opensearch backend + pip install stac-fastapi.core # Core library + ``` + +> **Important Note:** Starting with version 4.0.0a1, package names have changed from using periods (e.g., `stac-fastapi.core`) to using hyphens (e.g., `stac-fastapi-core`) to comply with PEP 625. The internal package structure uses underscores, but users should install with hyphens as shown above. Please update your requirements files accordingly. + +### Running Locally + +There are two main ways to run the API locally: + +#### Using Pre-built Docker Images + +- We provide ready-to-use Docker images through GitHub Container Registry: + - [ElasticSearch backend](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pkgs/container/stac-fastapi-es) + - [OpenSearch backend](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pkgs/container/stac-fastapi-os) + +- **Pull and run the images**: + ```shell + # For Elasticsearch backend + docker pull ghcr.io/stac-utils/stac-fastapi-es:latest + + # For OpenSearch backend + docker pull ghcr.io/stac-utils/stac-fastapi-os:latest + ``` + +#### Using Docker Compose + +- **Prerequisites**: Ensure [Docker Compose](https://docs.docker.com/compose/install/) or [Podman Compose](https://podman-desktop.io/docs/compose) is installed on your machine. + +- **Start the API**: + ```shell + docker compose up elasticsearch app-elasticsearch + ``` + +- **Configuration**: By default, Docker Compose uses Elasticsearch 8.x and OpenSearch 2.11.1. To use different versions, create a `.env` file: + ```shell + ELASTICSEARCH_VERSION=8.11.0 + OPENSEARCH_VERSION=2.11.1 + ENABLE_DIRECT_RESPONSE=false + ``` + +- **Compatibility**: The most recent Elasticsearch 7.x versions should also work. See the [opensearch-py docs](https://github.com/opensearch-project/opensearch-py/blob/main/COMPATIBILITY.md) for compatibility information. 
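Once the containers are up, a quick way to confirm the API is serving is to fetch the landing page. A minimal sketch, assuming the compose defaults above (app on `localhost:8080`, stdlib only):

```python
"""Smoke-test a local deployment by fetching the STAC landing page."""
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8080/") as resp:
    landing = json.load(resp)

# The landing page id defaults to "stac-fastapi" (see STAC_FASTAPI_LANDING_PAGE_ID).
print(landing["id"], landing.get("stac_version"))
```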
+ + + ## Configuration Reference You can customize additional settings in your `.env` file: @@ -231,6 +458,397 @@ You can customize additional settings in your `.env` file: > [!NOTE] > The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, `ES_VERIFY_CERTS` and `ES_TIMEOUT` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch. + +# Datetime-Based Index Management + +## Overview + +SFEOS supports two indexing strategies for managing STAC items: + +1. **Simple Indexing** (default) - One index per collection +2. **Datetime-Based Indexing** - Time-partitioned indexes with automatic management + +The datetime-based indexing strategy is particularly useful for large temporal datasets. When a user provides a datetime parameter in a query, the system knows exactly which index to search, providing **multiple times faster searches** and significantly **reducing database load**. + +## When to Use + +**Recommended for:** +- Systems with large collections containing millions of items +- Systems requiring high-performance temporal searching + +**Pros:** +- Multiple times faster queries with datetime filter +- Reduced database load - only relevant indexes are searched + +**Cons:** +- Slightly longer item indexing time (automatic index management) +- Greater management complexity + +## Configuration + +### Enabling Datetime-Based Indexing + +Enable datetime-based indexing by setting the following environment variable: + +```bash +ENABLE_DATETIME_INDEX_FILTERING=true +``` + +### Related Configuration Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `ENABLE_DATETIME_INDEX_FILTERING` | Enables time-based index partitioning | `false` | `true` | +| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit for datetime indexes (GB) - note: add +20% to target size due to ES/OS compression | `25` | `50` | +| `STAC_ITEMS_INDEX_PREFIX` | Prefix for item indexes | `items_` | `stac_items_` | + +## How Datetime-Based Indexing Works + +### Index and Alias Naming Convention + +The system uses a precise naming convention: + +**Physical indexes:** +``` +{ITEMS_INDEX_PREFIX}{collection-id}_{uuid4} +``` + +**Aliases:** +``` +{ITEMS_INDEX_PREFIX}{collection-id} # Main collection alias +{ITEMS_INDEX_PREFIX}{collection-id}_{start-datetime} # Temporal alias +{ITEMS_INDEX_PREFIX}{collection-id}_{start-datetime}_{end-datetime} # Closed index alias +``` + +**Example:** + +*Physical indexes:* +- `items_sentinel-2-l2a_a1b2c3d4-e5f6-7890-abcd-ef1234567890` + +*Aliases:* +- `items_sentinel-2-l2a` - main collection alias +- `items_sentinel-2-l2a_2024-01-01` - active alias from January 1, 2024 +- `items_sentinel-2-l2a_2024-01-01_2024-03-15` - closed index alias (reached size limit) + +### Index Size Management + +**Important - Data Compression:** Elasticsearch and OpenSearch automatically compress data. The configured `DATETIME_INDEX_MAX_SIZE_GB` limit refers to the compressed size on disk. It is recommended to add +20% to the target size to account for compression overhead and metadata. 
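For illustration, here is a small sketch of the naming, selection, and size rules described above. The helper functions are hypothetical stand-ins (the real logic lives in classes such as `IndexOperations`, `IndexSizeManager`, and `DatetimeBasedIndexSelector`); only the alias layout and the size rule follow this README, and for brevity it assumes collection ids contain no underscores:

```python
import os
import uuid

PREFIX = os.getenv("STAC_ITEMS_INDEX_PREFIX", "items_")
MAX_SIZE_GB = float(os.getenv("DATETIME_INDEX_MAX_SIZE_GB", "25"))


def physical_index_name(collection_id: str) -> str:
    """Physical indexes are UUID-named: {prefix}{collection-id}_{uuid4}."""
    return f"{PREFIX}{collection_id}_{uuid.uuid4()}"


def needs_rollover(size_gb: float) -> bool:
    """The limit applies to the compressed size on disk (hence the +20% advice)."""
    return size_gb > MAX_SIZE_GB


def select_aliases(aliases, query_start: str, query_end: str):
    """Keep aliases whose date range can overlap [query_start, query_end].

    Active aliases look like items_{collection}_{start}; closed ones carry an
    end date too. Dates are YYYY-MM-DD, so string comparison is chronological.
    """
    selected = []
    for alias in aliases:
        parts = alias[len(PREFIX):].split("_")  # [collection, start(, end)]
        start, end = parts[1], (parts[2] if len(parts) > 2 else None)
        if start <= query_end and (end is None or query_start <= end):
            selected.append(alias)
    return selected


aliases = [
    "items_sentinel-2-l2a_2024-01-01_2024-03-15",  # closed (hit the size limit)
    "items_sentinel-2-l2a_2024-03-15",             # active
]
print(select_aliases(aliases, "2024-02-01", "2024-02-28"))
# -> ['items_sentinel-2-l2a_2024-01-01_2024-03-15']
```

Because the main collection alias keeps covering both the closed and the active physical indexes, searches without a datetime filter remain unaffected by rollovers.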
+ +## Interacting with the API + +- **Creating a Collection**: + ```shell + curl -X "POST" "http://localhost:8080/collections" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "id": "my_collection" + }' + ``` + +- **Adding an Item to a Collection**: + ```shell + curl -X "POST" "http://localhost:8080/collections/my_collection/items" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d @item.json + ``` + +- **Searching for Items**: + ```shell + curl -X "GET" "http://localhost:8080/search" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "collections": ["my_collection"], + "limit": 10 + }' + ``` + +- **Filtering by Bbox**: + ```shell + curl -X "GET" "http://localhost:8080/search" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "collections": ["my_collection"], + "bbox": [-180, -90, 180, 90] + }' + ``` + +- **Filtering by Datetime**: + ```shell + curl -X "GET" "http://localhost:8080/search" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "collections": ["my_collection"], + "datetime": "2020-01-01T00:00:00Z/2020-12-31T23:59:59Z" + }' + ``` + +## Configure the API + +- **API Title and Description**: By default set to `stac-fastapi-`. Customize these by setting: + - `STAC_FASTAPI_TITLE`: Changes the API title in the documentation + - `STAC_FASTAPI_DESCRIPTION`: Changes the API description in the documentation + +- **Database Indices**: By default, the API reads from and writes to: + - `collections` index for collections + - `items_` indices for items + - Customize with `STAC_COLLECTIONS_INDEX` and `STAC_ITEMS_INDEX_PREFIX` environment variables + +- **Root Path Configuration**: The application root path is the base URL by default. + - For AWS Lambda with Gateway API: Set `STAC_FASTAPI_ROOT_PATH` to match the Gateway API stage name (e.g., `/v1`) + + +## Collection Pagination + +- **Overview**: The collections route supports pagination through optional query parameters. +- **Parameters**: + - `limit`: Controls the number of collections returned per page + - `token`: Used to retrieve subsequent pages of results +- **Response Structure**: The `links` field in the response contains a `next` link with the token for the next page of results. +- **Example Usage**: + ```shell + curl -X "GET" "http://localhost:8080/collections?limit=1&token=example_token" + ``` + +## Ingesting Sample Data CLI Tool + +- **Overview**: The `data_loader.py` script provides a convenient way to load STAC items into the database. + +- **Usage**: + ```shell + python3 data_loader.py --base-url http://localhost:8080 + ``` + +- **Options**: + ``` + --base-url TEXT Base URL of the STAC API [required] + --collection-id TEXT ID of the collection to which items are added + --use-bulk Use bulk insert method for items + --data-dir PATH Directory containing collection.json and feature + collection file + --help Show this message and exit. + ``` + +- **Example Workflows**: + - **Loading Sample Data**: + ```shell + python3 data_loader.py --base-url http://localhost:8080 + ``` + - **Loading Data to a Specific Collection**: + ```shell + python3 data_loader.py --base-url http://localhost:8080 --collection-id my-collection + ``` + - **Using Bulk Insert for Performance**: + ```shell + python3 data_loader.py --base-url http://localhost:8080 --use-bulk + ``` + +## Elasticsearch Mappings + +- **Overview**: Mappings apply to search index, not source data. They define how documents and their fields are stored and indexed. 
+- **Implementation**: + - Mappings are stored in index templates that are created on application startup + - These templates are automatically applied when creating new Collection and Item indices + - The `sfeos_helpers` package contains shared mapping definitions used by both Elasticsearch and OpenSearch backends +- **Customization**: Custom mappings can be defined by extending the base mapping templates. + +## Managing Elasticsearch Indices + +### Snapshots + +- **Overview**: Snapshots provide a way to backup and restore your indices. + +- **Creating a Snapshot Repository**: + ```shell + curl -X "PUT" "http://localhost:9200/_snapshot/my_fs_backup" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "type": "fs", + "settings": { + "location": "/usr/share/elasticsearch/snapshots/my_fs_backup" + } + }' + ``` + - This creates a snapshot repository that stores files in the elasticsearch/snapshots directory in this git repo clone + - The elasticsearch.yml and compose files create a mapping from that directory to /usr/share/elasticsearch/snapshots within the Elasticsearch container and grant permissions for using it + +- **Creating a Snapshot**: + ```shell + curl -X "PUT" "http://localhost:9200/_snapshot/my_fs_backup/my_snapshot_2?wait_for_completion=true" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "metadata": { + "taken_because": "dump of all items", + "taken_by": "pvarner" + }, + "include_global_state": false, + "ignore_unavailable": false, + "indices": "items_my-collection" + }' + ``` + - This creates a snapshot named my_snapshot_2 and waits for the action to be completed before returning + - This can also be done asynchronously by omitting the wait_for_completion parameter, and queried for status later + - The indices parameter determines which indices are snapshotted, and can include wildcards + +- **Viewing Snapshots**: + ```shell + # View a specific snapshot + curl http://localhost:9200/_snapshot/my_fs_backup/my_snapshot_2 + + # View all snapshots + curl http://localhost:9200/_snapshot/my_fs_backup/_all + ``` + - These commands allow you to check the status and details of your snapshots + +- **Restoring a Snapshot**: + ```shell + curl -X "POST" "http://localhost:9200/_snapshot/my_fs_backup/my_snapshot_2/_restore?wait_for_completion=true" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "include_aliases": false, + "include_global_state": false, + "ignore_unavailable": true, + "rename_replacement": "items_$1-copy", + "indices": "items_*", + "rename_pattern": "items_(.+)" + }' + ``` + - This specific command will restore any indices that match items_* and rename them so that the new index name will be suffixed with -copy + - The rename_pattern and rename_replacement parameters allow you to restore indices under new names + +- **Updating Collection References**: + ```shell + curl -X "POST" "http://localhost:9200/items_my-collection-copy/_update_by_query" \ + -H 'Content-Type: application/json; charset=utf-8' \ + -d $'{ + "query": { + "match_all": {} + }, + "script": { + "lang": "painless", + "params": { + "collection": "my-collection-copy" + }, + "source": "ctx._source.collection = params.collection" + } + }' + ``` + - After restoring, the item documents have been restored in the new index (e.g., my-collection-copy), but the value of the collection field in those documents is still the original value of my-collection + - This command updates these values to match the new collection name using Elasticsearch's Update By Query feature 
+ +- **Creating a New Collection**: + ```shell + curl -X "POST" "http://localhost:8080/collections" \ + -H 'Content-Type: application/json' \ + -d $'{ + "id": "my-collection-copy" + }' + ``` + - The final step is to create a new collection through the API with the new name for each of the restored indices + - This gives you a copy of the collection that has a resource URI (/collections/my-collection-copy) and can be correctly queried by collection name + +### Reindexing + +- **Overview**: Reindexing allows you to copy documents from one index to another, optionally transforming them in the process. + +- **Use Cases**: + - Apply changes to documents + - Correct dynamically generated mappings + - Transform data (e.g., lowercase identifiers) + - The index templates will make sure that manually created indices will also have the correct mappings and settings + +- **Example: Reindexing with Transformation**: + ```shell + curl -X "POST" "http://localhost:9200/_reindex" \ + -H 'Content-Type: application/json' \ + -d $'{ + "source": { + "index": "items_my-collection-lower_my-collection-hex-000001" + }, + "dest": { + "index": "items_my-collection-lower_my-collection-hex-000002" + }, + "script": { + "source": "ctx._source.id = ctx._source.id.toLowerCase()", + "lang": "painless" + } + }' + ``` + - In this example, we make a copy of an existing Item index but change the Item identifier to be lowercase + - The script parameter allows you to transform documents during the reindexing process + +- **Updating Aliases**: + ```shell + curl -X "POST" "http://localhost:9200/_aliases" \ + -H 'Content-Type: application/json' \ + -d $'{ + "actions": [ + { + "remove": { + "index": "*", + "alias": "items_my-collection" + } + }, + { + "add": { + "index": "items_my-collection-lower_my-collection-hex-000002", + "alias": "items_my-collection" + } + } + ] + }' + ``` + - If you are happy with the data in the newly created index, you can move the alias items_my-collection to the new index + - This makes the modified Items with lowercase identifiers visible to users accessing my-collection in the STAC API + - Using aliases allows you to switch between different index versions without changing the API endpoint + +## Auth + +- **Overview**: Authentication is an optional feature that can be enabled through Route Dependencies. +- **Implementation Options**: + - Basic authentication + - OAuth2 with Keycloak + - Custom route dependencies +- **Configuration**: Authentication can be configured using the `STAC_FASTAPI_ROUTE_DEPENDENCIES` environment variable. +- **Examples and Documentation**: Detailed examples and implementation guides can be found in the [examples/auth](examples/auth) directory. + +## Aggregation + +- **Supported Aggregations**: + - Spatial aggregations of points and geometries + - Frequency distribution aggregation of any property including dates + - Temporal distribution of datetime values + +- **Endpoint Locations**: + - Root Catalog level: `/aggregations` + - Collection level: `//aggregations` + +- **Implementation Details**: The `sfeos_helpers.aggregation` package provides specialized functionality for both Elasticsearch and OpenSearch backends. + +- **Documentation**: Detailed information about supported aggregations can be found in [the aggregation docs](./docs/src/aggregation.md). + + +## Rate Limiting + +- **Overview**: Rate limiting is an optional security feature that controls API request frequency on a remote address basis. 
+ +- **Configuration**: Enabled by setting the `STAC_FASTAPI_RATE_LIMIT` environment variable: + ``` + STAC_FASTAPI_RATE_LIMIT=500/minute + ``` + +- **Functionality**: + - Limits each client to a specified number of requests per time period (e.g., 500 requests per minute) + - Helps prevent API abuse and maintains system stability + - Ensures fair resource allocation among all clients + +- **Examples**: Implementation examples are available in the [examples/rate_limit](examples/rate_limit) directory. + +sed index selection using collection IDs. Requires indexes in format: STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day, e.g. items_sentinel-2-l2a_2025-06-06-2025-09-22. | `false` | Optional | +| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Note: This value should account for ~25% overhead due to OS/ES caching of data structures and metadata. Only applies when`ENABLE_DATETIME_INDEX_FILTERING` is enabled. | `25` | Optional | + +> [!NOTE] +> The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, `ES_VERIFY_CERTS` and `ES_TIMEOUT` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch. + ## Interacting with the API - **Creating a Collection**: From 3159e05d790816deb8cc6e0b5470d9d99d3676b4 Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Mon, 11 Aug 2025 12:52:38 +0200 Subject: [PATCH 07/11] fix --- README.md | 593 +++--------------------------------------------------- 1 file changed, 24 insertions(+), 569 deletions(-) diff --git a/README.md b/README.md index a0934b45..977a351a 100644 --- a/README.md +++ b/README.md @@ -201,264 +201,35 @@ There are two main ways to run the API locally: You can customize additional settings in your `.env` file: -| Variable | Description | Default | Required | -|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|---------------------------------------------------------------------------------------------| -| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | -| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS) | Optional | -| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `false` | Optional | -| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `false` | Optional | +| Variable | Description | Default | Required | +|------------------------------|--------------------------------------------------------------------------------------|--------------------------|---------------------------------------------------------------------------------------------| +| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | +| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS)| Optional | +| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `true` | Optional | +| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. 
| `true` | Optional | +| `ES_API_KEY` | API Key for external Elasticsearch/OpenSearch. | N/A | Optional | | `ES_TIMEOUT` | Client timeout for Elasticsearch/OpenSearch. | DB client default | Optional | -| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | -| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | -| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional | -| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional | -| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional | -| `APP_PORT` | Server port. | `8080` | Optional | -| `ENVIRONMENT` | Runtime environment. | `local` | Optional | -| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional | -| `RELOAD` | Enable auto-reload for development. | `true` | Optional | -| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional | -| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional | -| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional | | -| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional -| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional -| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` Optional | -| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional | -| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional | -| `ENABLE_DATETIME_INDEX_FILTERING` | Enable datetime-ba# stac-fastapi-elasticsearch-opensearch - - - - -

- -

- -**Jump to:** [Project Introduction](#project-introduction---what-is-sfeos) | [Quick Start](#quick-start) | [Table of Contents](#table-of-contents) - - [![Downloads](https://static.pepy.tech/badge/stac-fastapi-core?color=blue)](https://pepy.tech/project/stac-fastapi-core) - [![GitHub contributors](https://img.shields.io/github/contributors/stac-utils/stac-fastapi-elasticsearch-opensearch?color=blue)](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/graphs/contributors) - [![GitHub stars](https://img.shields.io/github/stars/stac-utils/stac-fastapi-elasticsearch-opensearch.svg?color=blue)](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/stargazers) - [![GitHub forks](https://img.shields.io/github/forks/stac-utils/stac-fastapi-elasticsearch-opensearch.svg?color=blue)](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/network/members) - [![PyPI version](https://img.shields.io/pypi/v/stac-fastapi-elasticsearch.svg?color=blue)](https://pypi.org/project/stac-fastapi-elasticsearch/) - [![STAC](https://img.shields.io/badge/STAC-1.1.0-blue.svg)](https://github.com/radiantearth/stac-spec/tree/v1.1.0) - [![stac-fastapi](https://img.shields.io/badge/stac--fastapi-6.0.0-blue.svg)](https://github.com/stac-utils/stac-fastapi) - -## Sponsors & Supporters - -The following organizations have contributed time and/or funding to support the development of this project: - -

- Healy Hyperspatial - Atomic Maps - VITO Remote Sensing -

- -## Project Introduction - What is SFEOS? - -SFEOS (stac-fastapi-elasticsearch-opensearch) is a high-performance, scalable API implementation for serving SpatioTemporal Asset Catalog (STAC) data - an enhanced GeoJSON format designed specifically for geospatial assets like satellite imagery, aerial photography, and other Earth observation data. This project enables organizations to: - -- **Efficiently catalog and search geospatial data** such as satellite imagery, aerial photography, DEMs, and other geospatial assets using Elasticsearch or OpenSearch as the database backend -- **Implement standardized STAC APIs** that support complex spatial, temporal, and property-based queries across large collections of geospatial data -- **Scale to millions of geospatial assets** with fast search performance through optimized spatial indexing and query capabilities -- **Support OGC-compliant filtering** including spatial operations (intersects, contains, etc.) and temporal queries -- **Perform geospatial aggregations** to analyze data distribution across space and time - -This implementation builds on the STAC-FastAPI framework, providing a production-ready solution specifically optimized for Elasticsearch and OpenSearch databases. It's ideal for organizations managing large geospatial data catalogs who need efficient discovery and access capabilities through standardized APIs. - - - -## Common Deployment Patterns - -stac-fastapi-elasticsearch-opensearch can be deployed in several ways depending on your needs: - -- **Containerized Application**: Run as a Docker container with connections to Elasticsearch/OpenSearch databases -- **Serverless Function**: Deploy as AWS Lambda or similar serverless function with API Gateway -- **Traditional Server**: Run on virtual machines or bare metal servers in your infrastructure -- **Kubernetes**: Deploy as part of a larger microservices architecture with container orchestration - -The implementation is flexible and can scale from small local deployments to large production environments serving millions of geospatial assets. - -## Technologies - -This project is built on the following technologies: STAC, stac-fastapi, FastAPI, Elasticsearch, Python, OpenSearch - -

- STAC - Python - FastAPI - Elasticsearch - OpenSearch -

- -## Table of Contents - -- [Documentation & Resources](#documentation--resources) -- [Package Structure](#package-structure) -- [Examples](#examples) -- [Performance](#performance) -- [Quick Start](#quick-start) - - [Installation](#installation) - - [Running Locally](#running-locally) -- [Configuration reference](#configuration-reference) -- [Interacting with the API](#interacting-with-the-api) -- [Configure the API](#configure-the-api) -- [Collection pagination](#collection-pagination) -- [Ingesting Sample Data CLI Tool](#ingesting-sample-data-cli-tool) -- [Elasticsearch Mappings](#elasticsearch-mappings) -- [Managing Elasticsearch Indices](#managing-elasticsearch-indices) - - [Snapshots](#snapshots) - - [Reindexing](#reindexing) -- [Auth](#auth) -- [Aggregation](#aggregation) -- [Rate Limiting](#rate-limiting) - -## Documentation & Resources - -- **Online Documentation**: [https://stac-utils.github.io/stac-fastapi-elasticsearch-opensearch](https://stac-utils.github.io/stac-fastapi-elasticsearch-opensearch/) -- **Source Code**: [https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch) -- **API Examples**: [Postman Documentation](https://documenter.getpostman.com/view/12888943/2s8ZDSdRHA) - Examples of how to use the API endpoints -- **Community**: - - [Gitter Chat](https://app.gitter.im/#/room/#stac-fastapi-elasticsearch_community:gitter.im) - For real-time discussions - - [GitHub Discussions](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/discussions) - For longer-form questions and answers - -## Package Structure - -This project is organized into several packages, each with a specific purpose: - -- **stac_fastapi_core**: Core functionality that's database-agnostic, including API models, extensions, and shared utilities. This package provides the foundation for building STAC API implementations with any database backend. See [stac-fastapi-mongo](https://github.com/Healy-Hyperspatial/stac-fastapi-mongo) for a working example. - -- **sfeos_helpers**: Shared helper functions and utilities used by both the Elasticsearch and OpenSearch backends. This package includes: - - `database`: Specialized modules for index, document, and database utility operations - - `aggregation`: Elasticsearch/OpenSearch-specific aggregation functionality - - Shared logic and utilities that improve code reuse between backends - -- **stac_fastapi_elasticsearch**: Complete implementation of the STAC API using Elasticsearch as the backend database. This package depends on both `stac_fastapi_core` and `sfeos_helpers`. -- -- **stac_fastapi_opensearch**: Complete implementation of the STAC API using OpenSearch as the backend database. This package depends on both `stac_fastapi_core` and `sfeos_helpers`. - -## Examples - -The `/examples` directory contains several useful examples and reference implementations: - -- **pip_docker**: Examples of running stac-fastapi-elasticsearch from PyPI in Docker without needing any code from the repository -- **auth**: Authentication examples including: - - Basic authentication - - OAuth2 with Keycloak - - Route dependencies configuration -- **rate_limit**: Example of implementing rate limiting for API requests -- **postman_collections**: Postman collection files you can import for testing API endpoints - -These examples provide practical reference implementations for various deployment scenarios and features. 
- -## Performance - -### Direct Response Mode - -- The `enable_direct_response` option is provided by the stac-fastapi core library (introduced in stac-fastapi 5.2.0) and is available in this project starting from v4.0.0. -- **Control via environment variable**: Set `ENABLE_DIRECT_RESPONSE=true` to enable this feature. -- **How it works**: When enabled, endpoints return Starlette Response objects directly, bypassing FastAPI's default serialization for improved performance. -- **Important limitation**: All FastAPI dependencies (including authentication, custom status codes, and validation) are disabled for all routes when this mode is enabled. -- **Best use case**: This mode is best suited for public or read-only APIs where authentication and custom logic are not required. -- **Default setting**: `false` for safety. -- **More information**: See [issue #347](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/issues/347) for background and implementation details. - -## Quick Start - -This section helps you get up and running with stac-fastapi-elasticsearch-opensearch quickly. - -### Installation - -- **For versions 4.0.0a1 and newer** (PEP 625 compliant naming): - ```bash - pip install stac-fastapi-elasticsearch # Elasticsearch backend - pip install stac-fastapi-opensearch # Opensearch backend - pip install stac-fastapi-core # Core library - ``` - -- **For versions 4.0.0a0 and older**: - ```bash - pip install stac-fastapi.elasticsearch # Elasticsearch backend - pip install stac-fastapi.opensearch # Opensearch backend - pip install stac-fastapi.core # Core library - ``` - -> **Important Note:** Starting with version 4.0.0a1, package names have changed from using periods (e.g., `stac-fastapi.core`) to using hyphens (e.g., `stac-fastapi-core`) to comply with PEP 625. The internal package structure uses underscores, but users should install with hyphens as shown above. Please update your requirements files accordingly. - -### Running Locally - -There are two main ways to run the API locally: - -#### Using Pre-built Docker Images - -- We provide ready-to-use Docker images through GitHub Container Registry: - - [ElasticSearch backend](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pkgs/container/stac-fastapi-es) - - [OpenSearch backend](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pkgs/container/stac-fastapi-os) - -- **Pull and run the images**: - ```shell - # For Elasticsearch backend - docker pull ghcr.io/stac-utils/stac-fastapi-es:latest - - # For OpenSearch backend - docker pull ghcr.io/stac-utils/stac-fastapi-os:latest - ``` - -#### Using Docker Compose - -- **Prerequisites**: Ensure [Docker Compose](https://docs.docker.com/compose/install/) or [Podman Compose](https://podman-desktop.io/docs/compose) is installed on your machine. - -- **Start the API**: - ```shell - docker compose up elasticsearch app-elasticsearch - ``` - -- **Configuration**: By default, Docker Compose uses Elasticsearch 8.x and OpenSearch 2.11.1. To use different versions, create a `.env` file: - ```shell - ELASTICSEARCH_VERSION=8.11.0 - OPENSEARCH_VERSION=2.11.1 - ENABLE_DIRECT_RESPONSE=false - ``` - -- **Compatibility**: The most recent Elasticsearch 7.x versions should also work. See the [opensearch-py docs](https://github.com/opensearch-project/opensearch-py/blob/main/COMPATIBILITY.md) for compatibility information. 
- - - -## Configuration Reference - -You can customize additional settings in your `.env` file: - -| Variable | Description | Default | Required | -|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|---------------------------------------------------------------------------------------------| -| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | -| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS) | Optional | -| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `false` | Optional | -| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `false` | Optional | -| `ES_TIMEOUT` | Client timeout for Elasticsearch/OpenSearch. | DB client default | Optional | -| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | -| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | -| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional | -| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional | -| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional | -| `APP_PORT` | Server port. | `8080` | Optional | -| `ENVIRONMENT` | Runtime environment. | `local` | Optional | -| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional | -| `RELOAD` | Enable auto-reload for development. | `true` | Optional | -| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional | -| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional | -| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional | | -| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional -| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional -| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` Optional | -| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional | -| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. 
| `true` | Optional | -| `ENABLE_DATETIME_INDEX_FILTERING` | Enable datetime-based index selection using collection IDs. Requires indexes in format: STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day, e.g. items_sentinel-2-l2a_2025-06-06-2025-09-22. | `false` | Optional | -| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Note: This value should account for ~25% overhead due to OS/ES caching of data structures and metadata. Only applies when`ENABLE_DATETIME_INDEX_FILTERING` is enabled. | `25` | Optional | +| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | +| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | +| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional | +| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional | +| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional | +| `APP_PORT` | Server port. | `8000` | Optional | +| `ENVIRONMENT` | Runtime environment. | `local` | Optional | +| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional | +| `RELOAD` | Enable auto-reload for development. | `true` | Optional | +| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional | +| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional | +| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional | +| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional | +| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional | +| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` | Optional | +| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional | +| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional | > [!NOTE] > The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, `ES_VERIFY_CERTS` and `ES_TIMEOUT` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch. - # Datetime-Based Index Management ## Overview @@ -842,319 +613,3 @@ The system uses a precise naming convention: - Ensures fair resource allocation among all clients - **Examples**: Implementation examples are available in the [examples/rate_limit](examples/rate_limit) directory. 
- -sed index selection using collection IDs. Requires indexes in format: STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day, e.g. items_sentinel-2-l2a_2025-06-06-2025-09-22. | `false` | Optional | -| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Note: This value should account for ~25% overhead due to OS/ES caching of data structures and metadata. Only applies when`ENABLE_DATETIME_INDEX_FILTERING` is enabled. | `25` | Optional | - -> [!NOTE] -> The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, `ES_VERIFY_CERTS` and `ES_TIMEOUT` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch. - -## Interacting with the API - -- **Creating a Collection**: - ```shell - curl -X "POST" "http://localhost:8080/collections" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "id": "my_collection" - }' - ``` - -- **Adding an Item to a Collection**: - ```shell - curl -X "POST" "http://localhost:8080/collections/my_collection/items" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d @item.json - ``` - -- **Searching for Items**: - ```shell - curl -X "GET" "http://localhost:8080/search" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "collections": ["my_collection"], - "limit": 10 - }' - ``` - -- **Filtering by Bbox**: - ```shell - curl -X "GET" "http://localhost:8080/search" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "collections": ["my_collection"], - "bbox": [-180, -90, 180, 90] - }' - ``` - -- **Filtering by Datetime**: - ```shell - curl -X "GET" "http://localhost:8080/search" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "collections": ["my_collection"], - "datetime": "2020-01-01T00:00:00Z/2020-12-31T23:59:59Z" - }' - ``` - -## Configure the API - -- **API Title and Description**: By default set to `stac-fastapi-`. Customize these by setting: - - `STAC_FASTAPI_TITLE`: Changes the API title in the documentation - - `STAC_FASTAPI_DESCRIPTION`: Changes the API description in the documentation - -- **Database Indices**: By default, the API reads from and writes to: - - `collections` index for collections - - `items_` indices for items - - Customize with `STAC_COLLECTIONS_INDEX` and `STAC_ITEMS_INDEX_PREFIX` environment variables - -- **Root Path Configuration**: The application root path is the base URL by default. - - For AWS Lambda with Gateway API: Set `STAC_FASTAPI_ROOT_PATH` to match the Gateway API stage name (e.g., `/v1`) - - -## Collection Pagination - -- **Overview**: The collections route supports pagination through optional query parameters. -- **Parameters**: - - `limit`: Controls the number of collections returned per page - - `token`: Used to retrieve subsequent pages of results -- **Response Structure**: The `links` field in the response contains a `next` link with the token for the next page of results. -- **Example Usage**: - ```shell - curl -X "GET" "http://localhost:8080/collections?limit=1&token=example_token" - ``` - -## Ingesting Sample Data CLI Tool - -- **Overview**: The `data_loader.py` script provides a convenient way to load STAC items into the database. 
- -- **Usage**: - ```shell - python3 data_loader.py --base-url http://localhost:8080 - ``` - -- **Options**: - ``` - --base-url TEXT Base URL of the STAC API [required] - --collection-id TEXT ID of the collection to which items are added - --use-bulk Use bulk insert method for items - --data-dir PATH Directory containing collection.json and feature - collection file - --help Show this message and exit. - ``` - -- **Example Workflows**: - - **Loading Sample Data**: - ```shell - python3 data_loader.py --base-url http://localhost:8080 - ``` - - **Loading Data to a Specific Collection**: - ```shell - python3 data_loader.py --base-url http://localhost:8080 --collection-id my-collection - ``` - - **Using Bulk Insert for Performance**: - ```shell - python3 data_loader.py --base-url http://localhost:8080 --use-bulk - ``` - -## Elasticsearch Mappings - -- **Overview**: Mappings apply to search index, not source data. They define how documents and their fields are stored and indexed. -- **Implementation**: - - Mappings are stored in index templates that are created on application startup - - These templates are automatically applied when creating new Collection and Item indices - - The `sfeos_helpers` package contains shared mapping definitions used by both Elasticsearch and OpenSearch backends -- **Customization**: Custom mappings can be defined by extending the base mapping templates. - -## Managing Elasticsearch Indices - -### Snapshots - -- **Overview**: Snapshots provide a way to backup and restore your indices. - -- **Creating a Snapshot Repository**: - ```shell - curl -X "PUT" "http://localhost:9200/_snapshot/my_fs_backup" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "type": "fs", - "settings": { - "location": "/usr/share/elasticsearch/snapshots/my_fs_backup" - } - }' - ``` - - This creates a snapshot repository that stores files in the elasticsearch/snapshots directory in this git repo clone - - The elasticsearch.yml and compose files create a mapping from that directory to /usr/share/elasticsearch/snapshots within the Elasticsearch container and grant permissions for using it - -- **Creating a Snapshot**: - ```shell - curl -X "PUT" "http://localhost:9200/_snapshot/my_fs_backup/my_snapshot_2?wait_for_completion=true" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "metadata": { - "taken_because": "dump of all items", - "taken_by": "pvarner" - }, - "include_global_state": false, - "ignore_unavailable": false, - "indices": "items_my-collection" - }' - ``` - - This creates a snapshot named my_snapshot_2 and waits for the action to be completed before returning - - This can also be done asynchronously by omitting the wait_for_completion parameter, and queried for status later - - The indices parameter determines which indices are snapshotted, and can include wildcards - -- **Viewing Snapshots**: - ```shell - # View a specific snapshot - curl http://localhost:9200/_snapshot/my_fs_backup/my_snapshot_2 - - # View all snapshots - curl http://localhost:9200/_snapshot/my_fs_backup/_all - ``` - - These commands allow you to check the status and details of your snapshots - -- **Restoring a Snapshot**: - ```shell - curl -X "POST" "http://localhost:9200/_snapshot/my_fs_backup/my_snapshot_2/_restore?wait_for_completion=true" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "include_aliases": false, - "include_global_state": false, - "ignore_unavailable": true, - "rename_replacement": "items_$1-copy", - "indices": "items_*", - "rename_pattern": 
"items_(.+)" - }' - ``` - - This specific command will restore any indices that match items_* and rename them so that the new index name will be suffixed with -copy - - The rename_pattern and rename_replacement parameters allow you to restore indices under new names - -- **Updating Collection References**: - ```shell - curl -X "POST" "http://localhost:9200/items_my-collection-copy/_update_by_query" \ - -H 'Content-Type: application/json; charset=utf-8' \ - -d $'{ - "query": { - "match_all": {} - }, - "script": { - "lang": "painless", - "params": { - "collection": "my-collection-copy" - }, - "source": "ctx._source.collection = params.collection" - } - }' - ``` - - After restoring, the item documents have been restored in the new index (e.g., my-collection-copy), but the value of the collection field in those documents is still the original value of my-collection - - This command updates these values to match the new collection name using Elasticsearch's Update By Query feature - -- **Creating a New Collection**: - ```shell - curl -X "POST" "http://localhost:8080/collections" \ - -H 'Content-Type: application/json' \ - -d $'{ - "id": "my-collection-copy" - }' - ``` - - The final step is to create a new collection through the API with the new name for each of the restored indices - - This gives you a copy of the collection that has a resource URI (/collections/my-collection-copy) and can be correctly queried by collection name - -### Reindexing - -- **Overview**: Reindexing allows you to copy documents from one index to another, optionally transforming them in the process. - -- **Use Cases**: - - Apply changes to documents - - Correct dynamically generated mappings - - Transform data (e.g., lowercase identifiers) - - The index templates will make sure that manually created indices will also have the correct mappings and settings - -- **Example: Reindexing with Transformation**: - ```shell - curl -X "POST" "http://localhost:9200/_reindex" \ - -H 'Content-Type: application/json' \ - -d $'{ - "source": { - "index": "items_my-collection-lower_my-collection-hex-000001" - }, - "dest": { - "index": "items_my-collection-lower_my-collection-hex-000002" - }, - "script": { - "source": "ctx._source.id = ctx._source.id.toLowerCase()", - "lang": "painless" - } - }' - ``` - - In this example, we make a copy of an existing Item index but change the Item identifier to be lowercase - - The script parameter allows you to transform documents during the reindexing process - -- **Updating Aliases**: - ```shell - curl -X "POST" "http://localhost:9200/_aliases" \ - -H 'Content-Type: application/json' \ - -d $'{ - "actions": [ - { - "remove": { - "index": "*", - "alias": "items_my-collection" - } - }, - { - "add": { - "index": "items_my-collection-lower_my-collection-hex-000002", - "alias": "items_my-collection" - } - } - ] - }' - ``` - - If you are happy with the data in the newly created index, you can move the alias items_my-collection to the new index - - This makes the modified Items with lowercase identifiers visible to users accessing my-collection in the STAC API - - Using aliases allows you to switch between different index versions without changing the API endpoint - -## Auth - -- **Overview**: Authentication is an optional feature that can be enabled through Route Dependencies. -- **Implementation Options**: - - Basic authentication - - OAuth2 with Keycloak - - Custom route dependencies -- **Configuration**: Authentication can be configured using the `STAC_FASTAPI_ROUTE_DEPENDENCIES` environment variable. 
-- **Examples and Documentation**: Detailed examples and implementation guides can be found in the [examples/auth](examples/auth) directory.
-
-## Aggregation
-
-- **Supported Aggregations**:
-  - Spatial aggregations of points and geometries
-  - Frequency distribution aggregation of any property including dates
-  - Temporal distribution of datetime values
-
-- **Endpoint Locations**:
-  - Root Catalog level: `/aggregations`
-  - Collection level: `/collections/{collection_id}/aggregations`
-
-- **Implementation Details**: The `sfeos_helpers.aggregation` package provides specialized functionality for both Elasticsearch and OpenSearch backends.
-
-- **Documentation**: Detailed information about supported aggregations can be found in [the aggregation docs](./docs/src/aggregation.md).
-
-## Rate Limiting
-
-- **Overview**: Rate limiting is an optional security feature that controls API request frequency on a per-remote-address basis.
-
-- **Configuration**: Enabled by setting the `STAC_FASTAPI_RATE_LIMIT` environment variable:
-  ```
-  STAC_FASTAPI_RATE_LIMIT=500/minute
-  ```
-
-- **Functionality**:
-  - Limits each client to a specified number of requests per time period (e.g., 500 requests per minute)
-  - Helps prevent API abuse and maintains system stability
-  - Ensures fair resource allocation among all clients
-
-- **Examples**: Implementation examples are available in the [examples/rate_limit](examples/rate_limit) directory.

From b2ff45b5330ccbb37429c5d0b10e12df81fa904f Mon Sep 17 00:00:00 2001
From: Grzegorz Pustulka
Date: Mon, 25 Aug 2025 10:29:22 +0200
Subject: [PATCH 08/11] fix import

---
 stac_fastapi/core/stac_fastapi/core/core.py | 7 ++-----
 .../stac_fastapi/elasticsearch/database_logic.py | 7 +++++--
 .../opensearch/stac_fastapi/opensearch/database_logic.py | 8 ++++++--
 .../stac_fastapi/sfeos_helpers/aggregation/client.py | 3 +--
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/stac_fastapi/core/stac_fastapi/core/core.py b/stac_fastapi/core/stac_fastapi/core/core.py
index 1fde5bd3..0db9aad9 100644
--- a/stac_fastapi/core/stac_fastapi/core/core.py
+++ b/stac_fastapi/core/stac_fastapi/core/core.py
@@ -37,7 +37,6 @@
     BulkTransactionMethod,
     Items,
 )
-from stac_fastapi.sfeos_helpers.database import return_date
 from stac_fastapi.types import stac as stac_types
 from stac_fastapi.types.conformance import BASE_CONFORMANCE_CLASSES
 from stac_fastapi.types.core import AsyncBaseCoreClient
@@ -326,9 +325,8 @@ async def item_collection(
         )

         try:
-            datetime_search = return_date(datetime)
             search = self.database.apply_datetime_filter(
-                search=search, datetime_search=datetime_search
+                search=search, datetime=datetime
             )
         except (ValueError, TypeError) as e:
             # Handle invalid interval formats if return_date fails
@@ -509,9 +507,8 @@ async def post_search(
         )

         try:
-            datetime_search = return_date(search_request.datetime)
             search = self.database.apply_datetime_filter(
-                search=search, datetime_search=datetime_search
+                search=search, datetime=search_request.datetime
             )
         except (ValueError, TypeError) as e:
             # Handle invalid interval formats if return_date fails
diff --git a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py
index 46766386..eb3861dd 100644
--- a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py
+++ b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py
@@ -39,6 +39,7 @@
     mk_actions,
     mk_item_id,
     populate_sort_shared,
+    return_date,
     validate_refresh,
 )
 from
stac_fastapi.sfeos_helpers.database.query import ( @@ -277,17 +278,19 @@ def apply_collections_filter(search: Search, collection_ids: List[str]): @staticmethod def apply_datetime_filter( - search: Search, datetime_search: Dict[str, Optional[str]] + search: Search, datetime: str | None ) -> Search: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - datetime_search: Dict[str, Optional[str]] + datetime: str | None Returns: The filtered search object. """ + datetime_search = return_date(datetime) + if not datetime_search: return search diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index 830694c3..8b2b0479 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -37,6 +37,7 @@ mk_actions, mk_item_id, populate_sort_shared, + return_date, validate_refresh, ) from stac_fastapi.sfeos_helpers.database.query import ( @@ -66,6 +67,7 @@ from stac_fastapi.types.links import resolve_links from stac_fastapi.types.stac import Collection, Item + logger = logging.getLogger(__name__) @@ -284,17 +286,19 @@ def apply_free_text_filter(search: Search, free_text_queries: Optional[List[str] @staticmethod def apply_datetime_filter( - search: Search, datetime_search: Dict[str, Optional[str]] + search: Search, datetime: str | None ) -> Search: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - datetime_search: Dict[str, Optional[str]] + datetime: str | None Returns: The filtered search object. """ + datetime_search = return_date(datetime) + if not datetime_search: return search diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py index 641c81f1..38d3050b 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py @@ -313,10 +313,9 @@ async def aggregate( search=search, item_ids=aggregate_request.ids ) - datetime_search = return_date(aggregate_request.datetime) if aggregate_request.datetime: search = self.database.apply_datetime_filter( - search=search, datetime_search=datetime_search + search=search, datetime=aggregate_request.datetime ) if aggregate_request.bbox: From 3ff0a9f7c267f779684a27505741ba60fc2797f5 Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Mon, 25 Aug 2025 10:47:22 +0200 Subject: [PATCH 09/11] fix --- stac_fastapi/core/stac_fastapi/core/core.py | 4 ++-- .../stac_fastapi/elasticsearch/database_logic.py | 9 ++++++--- .../stac_fastapi/opensearch/database_logic.py | 10 ++++++---- .../stac_fastapi/sfeos_helpers/aggregation/client.py | 3 +-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/stac_fastapi/core/stac_fastapi/core/core.py b/stac_fastapi/core/stac_fastapi/core/core.py index 0db9aad9..07b17890 100644 --- a/stac_fastapi/core/stac_fastapi/core/core.py +++ b/stac_fastapi/core/stac_fastapi/core/core.py @@ -325,7 +325,7 @@ async def item_collection( ) try: - search = self.database.apply_datetime_filter( + search, datetime_search = self.database.apply_datetime_filter( search=search, datetime=datetime ) except (ValueError, TypeError) as e: @@ -507,7 +507,7 @@ async def post_search( ) try: - search = 
self.database.apply_datetime_filter( + search, datetime_search = self.database.apply_datetime_filter( search=search, datetime=search_request.datetime ) except (ValueError, TypeError) as e: diff --git a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py index eb3861dd..1687eaf3 100644 --- a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py +++ b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py @@ -279,7 +279,7 @@ def apply_collections_filter(search: Search, collection_ids: List[str]): @staticmethod def apply_datetime_filter( search: Search, datetime: str | None - ) -> Search: + ) -> Tuple[Search, Dict[str, Optional[str]]]: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: @@ -292,7 +292,7 @@ def apply_datetime_filter( datetime_search = return_date(datetime) if not datetime_search: - return search + return search, datetime_search if "eq" in datetime_search: # For exact matches, include: @@ -359,7 +359,10 @@ def apply_datetime_filter( ), ] - return search.query(Q("bool", should=should, minimum_should_match=1)) + return ( + search.query(Q("bool", should=should, minimum_should_match=1)), + datetime_search, + ) @staticmethod def apply_bbox_filter(search: Search, bbox: List): diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index 8b2b0479..af48e8f8 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -67,7 +67,6 @@ from stac_fastapi.types.links import resolve_links from stac_fastapi.types.stac import Collection, Item - logger = logging.getLogger(__name__) @@ -287,7 +286,7 @@ def apply_free_text_filter(search: Search, free_text_queries: Optional[List[str] @staticmethod def apply_datetime_filter( search: Search, datetime: str | None - ) -> Search: + ) -> Tuple[Search, Dict[str, Optional[str]]]: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. 
Args: @@ -300,7 +299,7 @@ def apply_datetime_filter( datetime_search = return_date(datetime) if not datetime_search: - return search + return search, datetime_search if "eq" in datetime_search: # For exact matches, include: @@ -367,7 +366,10 @@ def apply_datetime_filter( ), ] - return search.query(Q("bool", should=should, minimum_should_match=1)) + return ( + search.query(Q("bool", should=should, minimum_should_match=1)), + datetime_search, + ) @staticmethod def apply_bbox_filter(search: Search, bbox: List): diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py index 38d3050b..bde569b7 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py @@ -21,7 +21,6 @@ Aggregation, AggregationCollection, ) -from stac_fastapi.sfeos_helpers.database import return_date from stac_fastapi.types.rfc3339 import DateTimeType from .format import frequency_agg, metric_agg @@ -314,7 +313,7 @@ async def aggregate( ) if aggregate_request.datetime: - search = self.database.apply_datetime_filter( + search, datetime_search = self.database.apply_datetime_filter( search=search, datetime=aggregate_request.datetime ) From cdceee92f2ae53231c424b45f357bbb53ee6eec6 Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Mon, 25 Aug 2025 10:53:08 +0200 Subject: [PATCH 10/11] fix type hints --- .../stac_fastapi/elasticsearch/database_logic.py | 4 ++-- .../opensearch/stac_fastapi/opensearch/database_logic.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py index 1687eaf3..5f100980 100644 --- a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py +++ b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py @@ -278,13 +278,13 @@ def apply_collections_filter(search: Search, collection_ids: List[str]): @staticmethod def apply_datetime_filter( - search: Search, datetime: str | None + search: Search, datetime: Optional[str] ) -> Tuple[Search, Dict[str, Optional[str]]]: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - datetime: str | None + datetime: Optional[str] Returns: The filtered search object. diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index af48e8f8..4ff44ca0 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -285,13 +285,13 @@ def apply_free_text_filter(search: Search, free_text_queries: Optional[List[str] @staticmethod def apply_datetime_filter( - search: Search, datetime: str | None + search: Search, datetime: Optional[str] ) -> Tuple[Search, Dict[str, Optional[str]]]: """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search: The search object to filter. - datetime: str | None + datetime: Optional[str] Returns: The filtered search object. 
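Patches 08 through 10 above move the `return_date` parsing inside `apply_datetime_filter` and have it return a `(search, datetime_search)` tuple, so callers get the parsed bounds back for reuse. Below is a minimal sketch of the parsing contract, assuming RFC 3339 strings and `..` or an empty string for open interval ends; it illustrates the dict shape only and is not the project's actual helper.

```python
# Sketch of a return_date-style mapping from the raw `datetime` query value
# to the bounds dict consumed by apply_datetime_filter. Illustrative only.
from typing import Dict, Optional


def parse_datetime_param(value: Optional[str]) -> Dict[str, Optional[str]]:
    if not value:
        return {}                      # no filter: caller returns search unchanged
    if "/" not in value:
        return {"eq": value}           # exact timestamp match
    start, end = value.split("/", 1)   # closed or half-open interval
    return {
        "gte": None if start in ("", "..") else start,
        "lte": None if end in ("", "..") else end,
    }


# New call pattern introduced by these patches:
# search, datetime_search = database.apply_datetime_filter(
#     search=search, datetime="2020-01-01T00:00:00Z/2020-12-31T23:59:59Z"
# )
```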
From 36357c3978dce4157121cf39bfccb046a02da463 Mon Sep 17 00:00:00 2001 From: Grzegorz Pustulka Date: Mon, 25 Aug 2025 11:10:46 +0200 Subject: [PATCH 11/11] fix --- .../stac_fastapi/sfeos_helpers/aggregation/client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py index bde569b7..1f77cd9e 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py @@ -316,6 +316,8 @@ async def aggregate( search, datetime_search = self.database.apply_datetime_filter( search=search, datetime=aggregate_request.datetime ) + else: + datetime_search = {"gte": None, "lte": None} if aggregate_request.bbox: bbox = aggregate_request.bbox
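Patch 11 guarantees that `datetime_search` is always bound in the aggregate path, defaulting to fully open bounds when no `datetime` filter is supplied. The following hedged sketch shows why a downstream consumer wants that invariant; the selector below is hypothetical, not project code.

```python
# Hypothetical consumer of datetime_search: an index selector that expects
# the "gte"/"lte" keys to always exist. Patch 11's else-branch supplies the
# open-bounds default instead of leaving the name unassigned.
from typing import Dict, List, Optional


def select_item_indexes(datetime_search: Dict[str, Optional[str]]) -> List[str]:
    if datetime_search.get("gte") is None and datetime_search.get("lte") is None:
        return ["items_*"]  # open bounds: query every item index
    # A real implementation would intersect the bounds with each index's
    # date range; this placeholder only shows the contract.
    return ["items_matching_date_range"]


# Equivalent of the aggregate path with no datetime filter after patch 11:
indexes = select_item_indexes({"gte": None, "lte": None})
assert indexes == ["items_*"]
```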