Skip to content

Commit 2931e16

Browse files
committed
🔧 Make MLT tweaks configurable
1 parent d585abc commit 2931e16

File tree

5 files changed

+69
-22
lines changed

5 files changed

+69
-22
lines changed

openaleph_search/parse/parser.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -298,19 +298,34 @@ def get_facet_significant_text_shard_size(self) -> int:
298298

299299
def get_mlt_min_doc_freq(self) -> int:
300300
"""Minimum document frequency for more_like_this query terms."""
301-
return self.getint("mlt_min_doc_freq", 1) or 1
301+
return self.getint("mlt_min_doc_freq", self.settings.mlt_min_doc_freq) or 1
302302

303303
def get_mlt_minimum_should_match(self) -> str:
304304
"""Minimum should match percentage for more_like_this query."""
305-
return self.get("mlt_minimum_should_match", "10%") or "10%"
305+
return (
306+
self.get("mlt_minimum_should_match", self.settings.mlt_minimum_should_match)
307+
or "10%"
308+
)
306309

307310
def get_mlt_min_term_freq(self) -> int:
308311
"""Minimum term frequency for more_like_this query terms."""
309-
return self.getint("mlt_min_term_freq", 1) or 1
312+
return self.getint("mlt_min_term_freq", self.settings.mlt_min_term_freq) or 1
310313

311314
def get_mlt_max_query_terms(self) -> int:
312315
"""Maximum number of query terms for more_like_this query."""
313-
return self.getint("mlt_max_query_terms", 200) or 200
316+
return (
317+
self.getint("mlt_max_query_terms", self.settings.mlt_max_query_terms) or 200
318+
)
319+
320+
def get_mlt_min_word_length(self) -> int:
321+
"""Minimum word length for more_like_this query terms."""
322+
return (
323+
self.getint("mlt_min_word_length", self.settings.mlt_min_word_length) or 5
324+
)
325+
326+
def get_mlt_max_doc_freq(self) -> int:
327+
"""Maximum document frequency for more_like_this query terms."""
328+
return self.getint("mlt_max_doc_freq", self.settings.mlt_max_doc_freq) or 500
314329

315330
def to_dict(self) -> dict[str, Any]:
316331
parser = super().to_dict()

openaleph_search/query/more_like_this.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44

55
from openaleph_search.index.mapping import Field
66
from openaleph_search.query.util import BoolQuery, bool_query, none_query
7+
from openaleph_search.settings import Settings
78

89
log = logging.getLogger(__name__)
10+
settings = Settings()
911

1012

1113
def more_like_this_query(
@@ -38,17 +40,21 @@ def more_like_this_query(
3840
elif datasets:
3941
query["bool"]["filter"].append({"terms": {"dataset": datasets}})
4042

41-
# Get configurable parameters from parser, with sensible defaults
42-
min_doc_freq = 1
43-
minimum_should_match = "10%"
44-
min_term_freq = 1
45-
max_query_terms = 200
43+
# Get configurable parameters from parser, falling back to settings defaults
44+
min_doc_freq = settings.mlt_min_doc_freq
45+
minimum_should_match = settings.mlt_minimum_should_match
46+
min_term_freq = settings.mlt_min_term_freq
47+
max_query_terms = settings.mlt_max_query_terms
48+
min_word_length = settings.mlt_min_word_length
49+
max_doc_freq = settings.mlt_max_doc_freq
4650

4751
if parser is not None:
4852
min_doc_freq = parser.get_mlt_min_doc_freq()
4953
minimum_should_match = parser.get_mlt_minimum_should_match()
5054
min_term_freq = parser.get_mlt_min_term_freq()
5155
max_query_terms = parser.get_mlt_max_query_terms()
56+
min_word_length = parser.get_mlt_min_word_length()
57+
max_doc_freq = parser.get_mlt_max_doc_freq()
5258

5359
# Build the more_like_this query using document ID
5460
mlt_query = {
@@ -59,9 +65,8 @@ def more_like_this_query(
5965
"max_query_terms": max_query_terms,
6066
"min_doc_freq": min_doc_freq,
6167
"minimum_should_match": minimum_should_match,
62-
# min_word_length filters out short stopwords (of, the, in, ...)
63-
"min_word_length": 5,
64-
"max_doc_freq": 500, # filter out very common terms
68+
"min_word_length": min_word_length,
69+
"max_doc_freq": max_doc_freq,
6570
"boost_terms": 1,
6671
}
6772
}

openaleph_search/settings.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,5 +82,13 @@ class Settings(BaseSettings):
8282
highlighter_no_match_size: int = 300
8383
highlighter_max_analyzed_offset: int = 999999
8484

85+
# More Like This defaults
86+
mlt_min_doc_freq: int = 1
87+
mlt_min_term_freq: int = 1
88+
mlt_max_query_terms: int = 200
89+
mlt_minimum_should_match: str = "10%"
90+
mlt_min_word_length: int = 5
91+
mlt_max_doc_freq: int = 500
92+
8593
# search control
8694
allow_leading_wildcard: bool = False

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,6 @@ build-backend = "poetry.core.masonry.api"
6565
[tool.pytest_env]
6666
DEBUG = 1
6767
OPENALEPH_SEARCH_INDEX_NAMESPACE_IDS = 0
68+
OPENALEPH_SEARCH_MLT_MIN_WORD_LENGTH = 3
69+
OPENALEPH_SEARCH_MLT_MIN_DOC_FREQ = 1
70+
OPENALEPH_SEARCH_MLT_MIN_TERM_FREQ = 1

tests/test_more_like_this.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,16 @@ def _create_mlt_query(
119119

120120
@pytest.fixture(scope="function")
121121
def index_test_documents(cleanup_after):
122-
"""Index test documents for more_like_this testing"""
122+
"""Index test documents for more_like_this testing.
123+
124+
Recreates indexes from scratch to eliminate stale segment metadata from
125+
previous tests that can corrupt MLT term statistics (deleted docs remain
126+
in Lucene segments and skew IDF calculations).
127+
"""
128+
from openaleph_search.index.admin import delete_index, upgrade_search
129+
130+
delete_index()
131+
upgrade_search()
123132
entities = [make_entity(doc) for doc in TEST_DOCUMENTS]
124133
index_bulk("test_mlt", entities, sync=True)
125134
return entities
@@ -148,11 +157,16 @@ def test_more_like_this_query_function():
148157
assert "like" in mlt_query
149158
assert mlt_query["like"] == [{"_id": "doc1"}]
150159

151-
# Check default parameters (hardcoded in more_like_this.py when no parser)
152-
assert mlt_query["min_doc_freq"] == 1
153-
assert mlt_query["minimum_should_match"] == "10%"
154-
assert mlt_query["min_term_freq"] == 1
155-
assert mlt_query["max_query_terms"] == 200
160+
# Check default parameters (from settings, no parser)
161+
from openaleph_search.settings import Settings
162+
163+
s = Settings()
164+
assert mlt_query["min_doc_freq"] == s.mlt_min_doc_freq
165+
assert mlt_query["minimum_should_match"] == s.mlt_minimum_should_match
166+
assert mlt_query["min_term_freq"] == s.mlt_min_term_freq
167+
assert mlt_query["max_query_terms"] == s.mlt_max_query_terms
168+
assert mlt_query["min_word_length"] == s.mlt_min_word_length
169+
assert mlt_query["max_doc_freq"] == s.mlt_max_doc_freq
156170

157171
# Test with custom parser
158172
parser = SearchQueryParser(
@@ -367,10 +381,12 @@ def test_more_like_this_configurable_parameters():
367381
break
368382

369383
assert mlt_query_default is not None
370-
assert mlt_query_default["min_doc_freq"] == 1 # parser default
371-
assert mlt_query_default["minimum_should_match"] == "10%" # parser default
372-
assert mlt_query_default["min_term_freq"] == 1 # parser default
373-
assert mlt_query_default["max_query_terms"] == 200 # parser default
384+
assert mlt_query_default["min_doc_freq"] == 1 # settings default (also pinned to 1 in the pytest env)
385+
assert mlt_query_default["minimum_should_match"] == "10%" # settings default
386+
assert mlt_query_default["min_term_freq"] == 1 # settings default (also pinned to 1 in the pytest env)
387+
assert mlt_query_default["max_query_terms"] == 200 # settings default
388+
assert mlt_query_default["min_word_length"] == 3 # settings test override
389+
assert mlt_query_default["max_doc_freq"] == 500 # settings default
374390

375391

376392
def test_more_like_this_bucket_filtering():

0 commit comments

Comments
 (0)