Skip to content

Commit 39df273

Browse files
authored
Merge pull request #832 from aaxelb/fix/valuesearch-trailing-slashes
[ENG-6576] fix: duplicate value-search values
2 parents 1db8e75 + 63740ca commit 39df273

File tree

3 files changed

+97
-48
lines changed

3 files changed

+97
-48
lines changed

share/search/index_strategy/trovesearch_denorm.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy):
5555
CURRENT_STRATEGY_CHECKSUM = ChecksumIri(
5656
checksumalgorithm_name='sha-256',
5757
salt='TrovesearchDenormIndexStrategy',
58-
hexdigest='e538bbc5966a6a289da9e5ba51ecde5ff29528bf07e940716ef8a888d6601916',
58+
hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8',
5959
)
6060

6161
# abstract method from IndexStrategy
@@ -73,7 +73,10 @@ def backfill_message_type(self):
7373

7474
# abstract method from Elastic8IndexStrategy
7575
def index_settings(self):
76-
return {}
76+
return {
77+
'number_of_shards': 5,
78+
'number_of_replicas': 2,
79+
}
7780

7881
# abstract method from Elastic8IndexStrategy
7982
def index_mappings(self):
@@ -127,7 +130,6 @@ def _card_mappings(self):
127130

128131
def _iri_value_mappings(self):
129132
return {
130-
'value_iri': ts.KEYWORD_MAPPING,
131133
'value_name': ts.KEYWORD_MAPPING,
132134
'value_title': ts.KEYWORD_MAPPING,
133135
'value_label': ts.KEYWORD_MAPPING,
@@ -137,7 +139,8 @@ def _iri_value_mappings(self):
137139

138140
def _paths_and_values_mappings(self):
139141
return {
140-
'focus_iri': ts.IRI_KEYWORD_MAPPING,
142+
'single_focus_iri': ts.KEYWORD_MAPPING,
143+
'focus_iri_synonyms': ts.KEYWORD_MAPPING,
141144
'propertypaths_present': ts.KEYWORD_MAPPING,
142145
# flattened properties (dynamic sub-properties with keyword values)
143146
'iri_by_propertypath': ts.FLATTENED_MAPPING,
@@ -311,21 +314,23 @@ def _card_subdoc(self) -> dict:
311314
def _iri_value_subdoc(self, iri: str) -> dict:
312315
_shortwalk = self._fullwalk.shortwalk_from(iri)
313316
return {
314-
'value_iri': iri,
315-
'value_iris': self._exact_and_suffuniq_iris(iri),
317+
**self._paths_and_values(_shortwalk),
316318
'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)),
317319
'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)),
318320
'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)),
319321
'at_card_propertypaths': [
320322
ts.propertypath_as_keyword(_path)
321323
for _path in self._fullwalk.paths_by_iri[iri]
322324
],
323-
**self._paths_and_values(_shortwalk),
324325
}
325326

326327
def _paths_and_values(self, walk: ts.GraphWalk):
327328
return {
328-
'focus_iri': self._exact_and_suffuniq_iris(walk.focus_iri),
329+
'single_focus_iri': walk.focus_iri.rstrip('/'), # remove trailing slash, but keep the scheme
330+
'focus_iri_synonyms': ts.suffuniq_iris(ts.iri_synonyms(
331+
walk.focus_iri,
332+
self.rdfdoc,
333+
)),
329334
'propertypaths_present': self._propertypaths_present(walk),
330335
'iri_by_propertypath': self._iris_by_propertypath(walk),
331336
'iri_by_depth': self._iris_by_depth(walk),
@@ -391,13 +396,6 @@ def _ints_by_propertypath(self, walk: ts.GraphWalk):
391396
for _path, _value_set in walk.integer_values.items()
392397
}
393398

394-
def _exact_and_suffuniq_iris(self, iri: str):
395-
_synonyms = ts.iri_synonyms(iri, self.rdfdoc)
396-
return {
397-
'exact': list(_synonyms),
398-
'suffuniq': ts.suffuniq_iris(_synonyms),
399-
}
400-
401399
###
402400
# normalizing search responses
403401

@@ -598,7 +596,7 @@ def _iri_filter(self, search_filter) -> dict:
598596

599597
def _path_iri_query(self, path, suffuniq_iris):
600598
if path == (OWL.sameAs,):
601-
_field = f'{self.base_field}.focus_iri.suffuniq'
599+
_field = f'{self.base_field}.focus_iri_synonyms'
602600
elif is_globpath(path):
603601
_field = f'{self.base_field}.iri_by_depth.{_depth_field_name(len(path))}'
604602
else:
@@ -801,7 +799,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d
801799
'aggs': {
802800
'agg_valuesearch_iris': {
803801
'terms': {
804-
'field': 'iri_value.value_iri',
802+
'field': 'iri_value.single_focus_iri',
805803
# WARNING: terribly hacky pagination (part one)
806804
'size': cursor.start_offset + cursor.page_size + 1,
807805
},

tests/share/search/index_strategy/_common_trovesearch_tests.py

Lines changed: 73 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Iterable, Iterator
22
import dataclasses
33
from datetime import date, timedelta
4-
import itertools
54
import math
65
from urllib.parse import urlencode
76

@@ -56,22 +55,14 @@ def test_for_smoke_with_daemon(self):
5655

5756
def test_cardsearch(self):
5857
self._fill_test_data_for_querying()
59-
_cardsearch_cases = itertools.chain(
60-
self.cardsearch_cases(),
61-
self.cardsearch_integer_cases(),
62-
)
63-
for _queryparams, _expected_focus_iris in _cardsearch_cases:
58+
for _queryparams, _expected_focus_iris in self.cardsearch_cases():
6459
self._assert_cardsearch_iris(_queryparams, _expected_focus_iris)
6560

6661
def test_cardsearch_after_deletion(self):
6762
_cards = self._fill_test_data_for_querying()
6863
_deleted_focus_iris = {BLARG.b}
6964
self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris])
70-
_cardsearch_cases = itertools.chain(
71-
self.cardsearch_cases(),
72-
self.cardsearch_integer_cases(),
73-
)
74-
for _queryparams, _expected_focus_iris in _cardsearch_cases:
65+
for _queryparams, _expected_focus_iris in self.cardsearch_cases():
7566
if isinstance(_expected_focus_iris, set):
7667
_expected_focus_iris -= _deleted_focus_iris
7768
else:
@@ -173,17 +164,17 @@ def test_valuesearch(self):
173164

174165
def test_valuesearch_after_deletion(self):
175166
_cards = self._fill_test_data_for_querying()
176-
_deleted_focus_iris = {BLARG.b}
167+
_deleted_focus_iris = {BLARG.c}
177168
self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris])
178169
_cases = [
179170
(
180171
{'valueSearchPropertyPath': 'subject'},
181-
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c}, # BLARG.subj_b no longer present
172+
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_b}, # BLARG.subj_c no longer present
182173
), (
183174
{'valueSearchPropertyPath': 'dateCreated'},
184-
{'1999', '2024'}, # 2012 no longer present
175+
{'1999', '2012'}, # 2024 no longer present
185176
), (
186-
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'bbbb'},
177+
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'cccc'},
187178
set(), # none
188179
)
189180
]
@@ -296,7 +287,11 @@ def _fill_test_data_for_querying(self):
296287
DCTERMS.created: {rdf.literal(date(2024, 12, 31))},
297288
DCTERMS.creator: {BLARG.someone_else},
298289
DCTERMS.title: {rdf.literal('cccc')},
299-
DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c},
290+
DCTERMS.subject: {
291+
BLARG['subj_ac/'], # this one has an extra trailing slash
292+
BLARG.subj_bc,
293+
BLARG.subj_c,
294+
},
300295
DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')},
301296
},
302297
BLARG.someone_else: {
@@ -401,14 +396,6 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str
401396
{'cardSearchFilter[dcterms:replaces][is-absent]': ''},
402397
set(),
403398
)
404-
yield (
405-
{'cardSearchFilter[subject]': BLARG.subj_ac},
406-
{BLARG.c, BLARG.a},
407-
)
408-
yield (
409-
{'cardSearchFilter[subject][none-of]': BLARG.subj_ac},
410-
{BLARG.b},
411-
)
412399
yield (
413400
{
414401
'cardSearchFilter[subject]': BLARG.subj_bc,
@@ -510,6 +497,8 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str
510497
{'cardSearchText': '"what is here"'},
511498
{BLARG.b},
512499
)
500+
yield from self.cardsearch_trailingslash_cases()
501+
yield from self.cardsearch_integer_cases()
513502

514503
def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]:
515504
# cases that depend on integer values getting indexed
@@ -522,6 +511,17 @@ def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] |
522511
[BLARG.c, BLARG.a, BLARG.b], # ordered list
523512
)
524513

514+
def cardsearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]:
515+
# cases that depend on trailing-slash ignorance
516+
yield (
517+
{'cardSearchFilter[subject]': BLARG.subj_ac},
518+
{BLARG.c, BLARG.a},
519+
)
520+
yield (
521+
{'cardSearchFilter[subject][none-of]': BLARG.subj_ac},
522+
{BLARG.b},
523+
)
524+
525525
def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
526526
yield (
527527
{'valueSearchPropertyPath': 'references'},
@@ -531,6 +531,26 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
531531
{'valueSearchPropertyPath': 'dateCreated'},
532532
{'1999', '2012', '2024'},
533533
)
534+
yield (
535+
{
536+
'valueSearchPropertyPath': 'references',
537+
'valueSearchFilter[resourceType]': BLARG.Thing,
538+
},
539+
{BLARG.b, BLARG.c},
540+
)
541+
yield (
542+
{
543+
'valueSearchPropertyPath': 'references',
544+
'valueSearchText': 'bbbb',
545+
},
546+
{BLARG.b},
547+
)
548+
yield from self.valuesearch_trailingslash_cases()
549+
yield from self.valuesearch_sameas_cases()
550+
# TODO: more
551+
552+
def valuesearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
553+
# cases that depend on trailing-slash ignorance
534554
yield (
535555
{'valueSearchPropertyPath': 'subject'},
536556
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_b, BLARG.subj_bc, BLARG.subj_c},
@@ -547,21 +567,43 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
547567
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'aaaa'},
548568
{BLARG.subj_ac, BLARG.subj_a},
549569
)
570+
571+
def valuesearch_sameas_cases(self):
550572
yield (
551573
{
552-
'valueSearchPropertyPath': 'references',
553-
'valueSearchFilter[resourceType]': BLARG.Thing,
574+
'valueSearchPropertyPath': 'subject',
575+
'cardSearchFilter[sameAs]': BLARG.a,
554576
},
555-
{BLARG.b, BLARG.c},
577+
{BLARG.subj_ac, BLARG.subj_a},
556578
)
557579
yield (
558580
{
559-
'valueSearchPropertyPath': 'references',
560-
'valueSearchText': 'bbbb',
581+
'valueSearchPropertyPath': 'subject',
582+
'cardSearchFilter[sameAs]': BLARG.a_same,
561583
},
562-
{BLARG.b},
584+
{BLARG.subj_ac, BLARG.subj_a},
585+
)
586+
yield (
587+
{
588+
'valueSearchPropertyPath': 'subject',
589+
'cardSearchFilter[sameAs]': BLARG.b_same,
590+
},
591+
{BLARG.subj_b, BLARG.subj_bc},
592+
)
593+
yield (
594+
{
595+
'valueSearchPropertyPath': 'subject',
596+
'valueSearchFilter[sameAs]': BLARG.subj_a,
597+
},
598+
{BLARG.subj_a},
599+
)
600+
yield (
601+
{
602+
'valueSearchPropertyPath': 'subject',
603+
'cardSearchFilter[subject]': BLARG.subj_ac,
604+
},
605+
{BLARG.subj_ac, BLARG.subj_a, BLARG.subj_c, BLARG.subj_bc},
563606
)
564-
# TODO: more
565607

566608
def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]):
567609
_messages_chunk = messages.MessagesChunk(

tests/share/search/index_strategy/test_trove_indexcard_flats.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,12 @@ def get_index_strategy(self):
1010

1111
def cardsearch_integer_cases(self):
1212
yield from () # integers not indexed by this strategy
13+
14+
def cardsearch_trailingslash_cases(self):
15+
yield from () # trailing-slash handling improved in trovesearch_denorm
16+
17+
def valuesearch_sameas_cases(self):
18+
yield from () # sameas handling improved in trovesearch_denorm
19+
20+
def valuesearch_trailingslash_cases(self):
21+
yield from () # trailing-slash handling improved in trovesearch_denorm

0 commit comments

Comments
 (0)