Skip to content

Commit ed36037

Browse files
committed
⚡️ Use eager_global_ordinals and improve empty search
1 parent 750b888 commit ed36037

File tree

4 files changed

+46
-7
lines changed

4 files changed

+46
-7
lines changed

openaleph_search/index/mapping.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -203,16 +203,17 @@ def base_mapping() -> dict[str, MappingProperty]:
203203
204204
Returns fresh dicts on each call so callers can safely mutate the result.
205205
"""
206+
ego = {"eager_global_ordinals": True} if settings.eager_global_ordinals else {}
206207
return {
207-
Field.DATASET: {**FieldType.KEYWORD},
208-
Field.SCHEMA: {**FieldType.KEYWORD},
209-
Field.SCHEMATA: {**FieldType.KEYWORD},
208+
Field.DATASET: {**FieldType.KEYWORD, **ego},
209+
Field.SCHEMA: {**FieldType.KEYWORD, **ego},
210+
Field.SCHEMATA: {**FieldType.KEYWORD, **ego},
210211
# for fast label display
211212
Field.CAPTION: {**FieldType.KEYWORD},
212213
# original names as matching (text) field
213214
Field.NAME: {**FieldType.NAME},
214215
# name keywords, lightly normalized
215-
Field.NAMES: {**FieldType.NAME_KEYWORD},
216+
Field.NAMES: {**FieldType.NAME_KEYWORD, **ego},
216217
# name normalizations for filters and matching
217218
Field.NAME_KEYS: {**FieldType.KEYWORD},
218219
Field.NAME_PARTS: {**FieldType.KEYWORD_COPY},
@@ -227,7 +228,7 @@ def base_mapping() -> dict[str, MappingProperty]:
227228
Field.TEXT: {**FieldType.TEXT},
228229
Field.TRANSLATION: {**FieldType.TEXT},
229230
# tagging
230-
Field.TAGS: {**FieldType.KEYWORD},
231+
Field.TAGS: {**FieldType.KEYWORD, **ego},
231232
# processing metadata
232233
Field.UPDATED_AT: {**FieldType.DATE},
233234
Field.CREATED_AT: {**FieldType.DATE},
@@ -239,7 +240,7 @@ def base_mapping() -> dict[str, MappingProperty]:
239240
# OpenAleph leaked context data, probably deprecated soon
240241
Field.ROLE: {**FieldType.KEYWORD},
241242
Field.PROFILE: {**FieldType.KEYWORD},
242-
Field.COLLECTION_ID: {**FieldType.KEYWORD},
243+
Field.COLLECTION_ID: {**FieldType.KEYWORD, **ego},
243244
Field.MUTABLE: {**FieldType.BOOL},
244245
# length normalization
245246
Field.NUM_VALUES: {**FieldType.INTEGER},

openaleph_search/query/base.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
log = get_logger(__name__)
2929
settings = Settings()
3030

31+
_FACET_SAMPLER_KEY = "facets.sampled"
32+
3133

3234
class Query:
3335
TEXT_FIELDS: ClassVar[list[str]] = [Field.TEXT]
@@ -205,6 +207,17 @@ def get_aggregations(self) -> dict[str, Any]:
205207
else:
206208
aggregations.update(facet_aggregations)
207209

210+
# For empty queries on large indexes, wrap facet aggregations in a
211+
# sampler so ES only aggregates up to shard_size docs per shard. Facet
212+
# counts then cover only that sample, but the query is much faster.
213+
if self.is_empty_query and aggregations:
214+
aggregations = {
215+
_FACET_SAMPLER_KEY: {
216+
"sampler": {"shard_size": settings.facet_sampler_size},
217+
"aggs": aggregations,
218+
}
219+
}
220+
208221
# Significant terms aggregations
209222
for facet_name in self.parser.facet_significant_names:
210223
facet_aggregations = {}
@@ -321,6 +334,8 @@ def get_significant_text_sampler(self) -> dict[str, Any]:
321334

322335
def get_sort(self) -> list[str | dict[str, dict[str, Any]]]:
323336
"""Pick one of a set of named result orderings."""
337+
if self.is_empty_query and not len(self.parser.sorts):
338+
return ["_doc"]
324339
if not len(self.parser.sorts):
325340
return self.SORT_DEFAULT
326341

@@ -466,4 +481,13 @@ def search(self) -> ObjectApiResponse:
466481
took=result.get("took"),
467482
hits=result.get("hits", {}).get("total", {}).get("value"),
468483
)
484+
485+
# Unwrap sampled facet aggregations so consumers see the same
486+
# response structure regardless of whether sampling was used.
487+
aggs = result.get("aggregations", {})
488+
if _FACET_SAMPLER_KEY in aggs:
489+
sampled = aggs.pop(_FACET_SAMPLER_KEY)
490+
sampled.pop("doc_count", None)
491+
aggs.update(sampled)
492+
469493
return result

openaleph_search/query/queries.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def get_index(self):
7979

8080
def get_query(self) -> dict[str, Any]:
8181
query = self.get_inner_query()
82-
if settings.query_function_score:
82+
if settings.query_function_score and not self.is_empty_query:
8383
return self.wrap_query_function_score(query)
8484
return query
8585

@@ -262,6 +262,11 @@ def get_index(self):
262262
schemata = list(self.entity.schema.matchable_schemata)
263263
return entities_read_index(schema=schemata)
264264

265+
def get_sort(self) -> list[str | dict[str, dict[str, Any]]]:
266+
# Always sort by score — the match query builds scoring clauses
267+
# even though the parser has no user text (is_empty_query=True).
268+
return ["_score"]
269+
265270
def get_inner_query(self) -> dict[str, Any]:
266271
query = match_query(
267272
self.entity,

openaleph_search/settings.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ class Settings(BaseSettings):
6161
index_boost_documents: int = 1
6262
index_boost_pages: int = 1
6363

64+
# Sampler shard_size (docs sampled per shard) for facet aggregations on
65+
# empty queries. Trades exact facet counts for speed on large indexes.
66+
facet_sampler_size: int = 5000
67+
6468
# Sampler for significant_terms / significant_text aggregations
6569
significant_terms_sampler_size: int = 2000
6670
significant_text_sampler_size: int = 200
@@ -96,5 +100,10 @@ class Settings(BaseSettings):
96100
mlt_min_word_length: int = 5
97101
mlt_max_doc_freq: int = 500
98102

103+
# Pre-build global ordinals on frequently-aggregated keyword fields
104+
# during refresh. Eliminates first-query latency spikes at the cost of
105+
# slightly slower refreshes.
106+
eager_global_ordinals: bool = True
107+
99108
# search control
100109
allow_leading_wildcard: bool = False

0 commit comments

Comments
 (0)