⚡️ (query/matching) Make weak signals opt-in

simonwoerpel · simonwoerpel · commit d84ca6c1fadc · 2026-03-09T08:56:58.000+01:00
diff --git a/docs/matching.md b/docs/matching.md
@@ -23,6 +23,25 @@ The index stores multiple name representations to catch variations:
 - Name parts (partial matching) (`name_parts`)
 
 
+## Configuration
+
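+Matching stages 1 (normalized keywords, `names`) and 2 (name keys, `name_keys`) are always enabled. Stages 3-5 (name parts, phonetic encoding and name symbols, numbered here in query order rather than by the section headings below) are disabled by default and can be enabled via environment variables: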
+
+| Setting | Default | Stage |
+|---------|---------|-------|
+| `OPENALEPH_SEARCH_MATCH_NAME_PARTS` | `false` | Name parts (partial token overlap) |
+| `OPENALEPH_SEARCH_MATCH_PHONETIC` | `false` | Phonetic encoding (sound-alike) |
+| `OPENALEPH_SEARCH_MATCH_SYMBOLS` | `false` | Name symbols (cross-language) |
+
+Enabling more stages improves recall (finding more potential matches) at the cost of query complexity and performance. For most use cases, stages 1 and 2 provide sufficient matching quality.
+
+```bash
+# Enable all matching stages
+export OPENALEPH_SEARCH_MATCH_NAME_PARTS=true
+export OPENALEPH_SEARCH_MATCH_PHONETIC=true
+export OPENALEPH_SEARCH_MATCH_SYMBOLS=true
+```
+
 ## Name matching strategies
 
 ### 1. Normalized keywords
@@ -42,7 +61,10 @@ Normalization:
 
 Exact name matches (with order preserved) receive the highest boost.
 
-### 2. Name symbols
+### 2. Name symbols {: #name-symbols }
+
+!!! note
+    Disabled by default. Enable with `OPENALEPH_SEARCH_MATCH_SYMBOLS=true`.
 
 Cross-language and cross-alphabet matching via symbolic representations. This can be considered as a synonyms search, but more precise and context specific than [a global synonyms file](https://www.elastic.co/docs/solutions/search/full-text/search-with-synonyms).
 
@@ -58,7 +80,10 @@ Example:
 
 Same symbol = same entity name (part) across languages.
 
-### 3. Phonetic encoding
+### 3. Phonetic encoding {: #phonetic }
+
+!!! note
+    Disabled by default. Enable with `OPENALEPH_SEARCH_MATCH_PHONETIC=true`.
 
 Sound-alike matching using Double Metaphone algorithm.
 
@@ -72,7 +97,10 @@ Example:
 
 Catches alternate spellings and transcription variations.
 
-### 4. Name parts
+### 4. Name parts {: #name-parts }
+
+!!! note
+    Disabled by default. Enable with `OPENALEPH_SEARCH_MATCH_NAME_PARTS=true`.
 
 Individual name components for partial matching.
 
@@ -143,16 +171,16 @@ Only compatible schema types can match each other.
 
 Match scores combine multiple factors:
 
-| Signal | Boost | Index field |
-|--------|-------|-------------|
-| Names (exact, order preserved) | 5.0 | `names` |
-| Name keys (order-independent) | 3.0 | `name_keys` |
-| Identifiers | 3.0 | `properties.*` (for group type "identifier") |
-| High-value properties | 2.0 | `properties.*` (ip, url, email, phone) |
-| Name parts | 1.0 | `name_parts` |
-| Other properties | 1.0 | `properties.*` |
-| Phonetic codes | 0.8 | `name_phonetics` |
-| Name symbols | 0.8 | `name_symbols` |
+| Signal | Boost | Index field | Default |
+|--------|-------|-------------|---------|
+| Names (exact, order preserved) | 5.0 | `names` | always |
+| Name keys (order-independent) | 3.0 | `name_keys` | always |
+| Identifiers | 3.0 | `properties.*` (for group type "identifier") | always |
+| High-value properties | 2.0 | `properties.*` (ip, url, email, phone) | always |
+| Name parts | 1.0 | `name_parts` | opt-in |
+| Other properties | 1.0 | `properties.*` | always |
+| Phonetic codes | 0.8 | `name_phonetic` | opt-in |
+| Name symbols | 0.8 | `name_symbols` | opt-in |
 
 Higher boost = more important for matching.
 
@@ -192,9 +220,10 @@ A match query combines multiple strategies:
             // Name matching clauses (using terms queries for efficiency)
             {"terms": {"names": ["john smith"], "boost": 5.0}},
             {"terms": {"name_keys": ["johnsmith"], "boost": 3.0}},
-            {"terms_set": {"name_parts": {"terms": ["john", "smith"], "minimum_should_match_script": {...}}}},
-            {"terms_set": {"name_phonetic": {"terms": ["JN", "SM0"], "minimum_should_match_script": {...}}}},
-            {"terms_set": {"name_symbols": {"terms": ["[NAME:12345]"], "minimum_should_match_script": {...}}}}
+            // Optional stages (disabled by default, enable via settings):
+            {"terms_set": {"name_parts": {"terms": ["john", "smith"], "minimum_should_match_script": {...}}}},   // match_name_parts
+            {"terms_set": {"name_phonetic": {"terms": ["JN", "SM0"], "minimum_should_match_script": {...}}}},    // match_phonetic
+            {"terms_set": {"name_symbols": {"terms": ["[NAME:12345]"], "minimum_should_match_script": {...}}}}   // match_symbols
           ],
           "minimum_should_match": 1
         }
diff --git a/docs/reference/settings.md b/docs/reference/settings.md
@@ -393,6 +393,38 @@ Maximum document frequency for MLT query terms. Common terms above this threshol
 - Type: `int`
 - Default: `500`
 
+## Entity matching
+
+[Read more](../matching.md)
+
+### `match_name_parts`
+
+Enable name parts matching (partial token overlap, requires 2+ matching tokens).
+
+- Type: `bool`
+- Default: `false`
+
+### `match_phonetic`
+
+Enable phonetic matching (sound-alike via Double Metaphone).
+
+- Type: `bool`
+- Default: `false`
+
+### `match_symbols`
+
+Enable name symbols matching (cross-language/alphabet via Wikidata).
+
+- Type: `bool`
+- Default: `false`
+
+```bash
+# Enable all optional matching stages
+export OPENALEPH_SEARCH_MATCH_NAME_PARTS=true
+export OPENALEPH_SEARCH_MATCH_PHONETIC=true
+export OPENALEPH_SEARCH_MATCH_SYMBOLS=true
+```
+
 ## Authorization
 
 [Read more](./authorization.md)
diff --git a/openaleph_search/query/matching.py b/openaleph_search/query/matching.py
@@ -8,6 +8,7 @@
 
 from openaleph_search.index.mapping import Field, property_field_name
 from openaleph_search.query.util import BoolQuery, bool_query, none_query
+from openaleph_search.settings import Settings
 from openaleph_search.transform.util import (
     index_name_keys,
     index_name_parts,
@@ -93,50 +94,55 @@ def names_query(schema: Schema, names: list[str]) -> Clauses:
     if keys:
         shoulds.append({"terms": {Field.NAME_KEYS: keys, "boost": 3.0}})
 
+    settings = Settings()
+
     # 3. name_parts: partial token overlap (requires 2+ matching tokens)
-    parts = list(index_name_parts(schema, names))
-    if parts:
-        shoulds.append(
-            {
-                "terms_set": {
-                    Field.NAME_PARTS: {
-                        "terms": parts,
-                        "minimum_should_match_script": _min_should_match_script(2),
-                        "boost": 1.0,
+    if settings.match_name_parts:
+        parts = list(index_name_parts(schema, names))
+        if parts:
+            shoulds.append(
+                {
+                    "terms_set": {
+                        Field.NAME_PARTS: {
+                            "terms": parts,
+                            "minimum_should_match_script": _min_should_match_script(2),
+                            "boost": 1.0,
+                        }
                     }
                 }
-            }
-        )
+            )
 
     # 4. name_phonetic: spelling/transliteration variants
-    phonetics = list(phonetic_names(schema, names))
-    if phonetics:
-        shoulds.append(
-            {
-                "terms_set": {
-                    Field.NAME_PHONETIC: {
-                        "terms": phonetics,
-                        "minimum_should_match_script": _min_should_match_script(2),
-                        "boost": 0.8,
+    if settings.match_phonetic:
+        phonetics = list(phonetic_names(schema, names))
+        if phonetics:
+            shoulds.append(
+                {
+                    "terms_set": {
+                        Field.NAME_PHONETIC: {
+                            "terms": phonetics,
+                            "minimum_should_match_script": _min_should_match_script(2),
+                            "boost": 0.8,
+                        }
                     }
                 }
-            }
-        )
+            )
 
     # 5. name_symbols: synonyms, nicknames, company suffixes
-    symbols = [str(s) for s in get_name_symbols(schema, *names)]
-    if symbols:
-        shoulds.append(
-            {
-                "terms_set": {
-                    Field.NAME_SYMBOLS: {
-                        "terms": symbols,
-                        "minimum_should_match_script": _min_should_match_script(2),
-                        "boost": 0.8,
+    if settings.match_symbols:
+        symbols = [str(s) for s in get_name_symbols(schema, *names)]
+        if symbols:
+            shoulds.append(
+                {
+                    "terms_set": {
+                        Field.NAME_SYMBOLS: {
+                            "terms": symbols,
+                            "minimum_should_match_script": _min_should_match_script(2),
+                            "boost": 0.8,
+                        }
                     }
                 }
-            }
-        )
+            )
 
     return shoulds
 
diff --git a/openaleph_search/settings.py b/openaleph_search/settings.py
@@ -98,6 +98,12 @@ class Settings(BaseSettings):
     mlt_min_word_length: int = 5
     mlt_max_doc_freq: int = 500
 
+    # Entity matching stages (names_query in query/matching.py)
+    # Stages 1 (names) and 2 (name_keys) are always enabled.
+    match_name_parts: bool = False
+    match_phonetic: bool = False
+    match_symbols: bool = False
+
     # Pre-build global ordinals on frequently-aggregated keyword fields
     # during refresh. Eliminates first-query latency spikes at the cost of
     # slightly slower refreshes.
diff --git a/pyproject.toml b/pyproject.toml
@@ -70,3 +70,6 @@ OPENALEPH_SEARCH_SIGNIFICANT_TERMS_RANDOM_SAMPLER = 0
 OPENALEPH_SEARCH_MLT_MIN_WORD_LENGTH = 3
 OPENALEPH_SEARCH_MLT_MIN_DOC_FREQ = 1
 OPENALEPH_SEARCH_MLT_MIN_TERM_FREQ = 1
+OPENALEPH_SEARCH_MATCH_NAME_PARTS = 1
+OPENALEPH_SEARCH_MATCH_PHONETIC = 1
+OPENALEPH_SEARCH_MATCH_SYMBOLS = 1