Skip to content

Commit 317b7dc

Browse files
authored
Merge pull request #836 from aaxelb/fix/indexer-fallback-failure
[ENG-6654] fixfix: correct misunderstanding, handle conflicts
2 parents c85fb0d + 0880650 commit 317b7dc

File tree

8 files changed

+147
-67
lines changed

8 files changed

+147
-67
lines changed

docker-compose.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,9 @@ services:
184184

185185
worker:
186186
image: quay.io/centerforopenscience/share:develop
187-
command: /usr/local/bin/celery --app project worker --uid daemon -l INFO
187+
command:
188+
chown -R daemon:daemon /elastic8_certs/ &&
189+
/usr/local/bin/celery --app project worker --uid daemon -l INFO
188190
depends_on:
189191
- postgres
190192
- rabbitmq

project/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ def split(string, delim):
314314
'TIMEOUT': int(os.environ.get('ELASTICSEARCH_TIMEOUT', '45')),
315315
'CHUNK_SIZE': int(os.environ.get('ELASTICSEARCH_CHUNK_SIZE', 2000)),
316316
'MAX_RETRIES': int(os.environ.get('ELASTICSEARCH_MAX_RETRIES', 7)),
317+
'POST_INDEX_DELAY': int(os.environ.get('ELASTICSEARCH_POST_INDEX_DELAY', 3)),
317318
}
318319
ELASTICSEARCH5_URL = (
319320
os.environ.get('ELASTICSEARCH5_URL')

share/search/daemon.py

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import threading
99
import time
1010

11-
import amqp.exceptions
1211
from django.conf import settings
1312
import kombu
1413
from kombu.mixins import ConsumerMixin
@@ -61,8 +60,6 @@ def start_daemonthreads_for_strategy(self, index_strategy):
6160
index_strategy=index_strategy,
6261
message_callback=_daemon.on_message,
6362
)
64-
# give the daemon a more robust callback for ack-ing
65-
_daemon.ack_callback = _consumer.ensure_ack
6663
# spin up daemonthreads, ready for messages
6764
self._daemonthreads.extend(_daemon.start())
6865
# start a thread to consume messages from this strategy's queues
@@ -82,7 +79,7 @@ def stop_daemonthreads(self, *, wait=False):
8279

8380

8481
class KombuMessageConsumer(ConsumerMixin):
85-
PREFETCH_COUNT = 7500
82+
PREFETCH_COUNT = settings.ELASTICSEARCH['CHUNK_SIZE']
8683

8784
should_stop: bool # (from ConsumerMixin)
8885

@@ -130,28 +127,9 @@ def consume(self, *args, **kwargs):
130127
consume = self.connection.ensure(self.connection, super().consume)
131128
return consume(*args, **kwargs)
132129

133-
def ensure_ack(self, daemon_message: messages.DaemonMessage):
134-
# if the connection the message came thru is no longer usable,
135-
# use `kombu.Connection.autoretry` to revive it for an ack
136-
try:
137-
daemon_message.ack()
138-
except (ConnectionError, amqp.exceptions.ConnectionError):
139-
@self.connection.autoretry
140-
def _do_ack(*, channel):
141-
try:
142-
channel.basic_ack(daemon_message.kombu_message.delivery_tag)
143-
finally:
144-
channel.close()
145-
_do_ack()
146-
147-
148-
def _default_ack_callback(daemon_message: messages.DaemonMessage) -> None:
149-
daemon_message.ack()
150-
151130

152131
class IndexerDaemon:
153-
MAX_LOCAL_QUEUE_SIZE = 5000
154-
ack_callback: Callable[[messages.DaemonMessage], None]
132+
MAX_LOCAL_QUEUE_SIZE = settings.ELASTICSEARCH['CHUNK_SIZE']
155133

156134
def __init__(self, index_strategy, *, stop_event=None, daemonthread_context=None):
157135
self.stop_event = (
@@ -163,7 +141,6 @@ def __init__(self, index_strategy, *, stop_event=None, daemonthread_context=None
163141
self.__daemonthread_context = daemonthread_context or contextlib.nullcontext
164142
self.__local_message_queues = {}
165143
self.__started = False
166-
self.ack_callback = _default_ack_callback
167144

168145
def start(self) -> list[threading.Thread]:
169146
if self.__started:
@@ -192,7 +169,6 @@ def start_typed_loop_and_queue(self, message_type) -> threading.Thread:
192169
local_message_queue=_queue_from_rabbit_to_daemon,
193170
log_prefix=f'{repr(self)} MessageHandlingLoop: ',
194171
daemonthread_context=self.__daemonthread_context,
195-
ack_callback=self.ack_callback,
196172
)
197173
return _handling_loop.start_thread()
198174

@@ -226,7 +202,6 @@ class MessageHandlingLoop:
226202
local_message_queue: queue.Queue
227203
log_prefix: str
228204
daemonthread_context: Callable[[], contextlib.AbstractContextManager]
229-
ack_callback: Callable[[messages.DaemonMessage], None]
230205
_leftover_daemon_messages_by_target_id = None
231206

232207
def __post_init__(self):
@@ -310,7 +285,7 @@ def _handle_some_messages(self):
310285
sentry_sdk.capture_message('error handling message', extras={'message_response': message_response})
311286
target_id = message_response.index_message.target_id
312287
for daemon_message in daemon_messages_by_target_id.pop(target_id, ()):
313-
self.ack_callback(daemon_message)
288+
daemon_message.ack() # finally set it free
314289
if daemon_messages_by_target_id: # should be empty by now
315290
logger.error('%sUnhandled messages?? %s', self.log_prefix, len(daemon_messages_by_target_id))
316291
sentry_sdk.capture_message(

share/search/index_strategy/elastic8.py

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,11 @@ def index_mappings(self):
6060
raise NotImplementedError
6161

6262
@abc.abstractmethod
63-
def build_elastic_actions(self, messages_chunk: messages.MessagesChunk) -> typing.Iterable[tuple[int, dict]]:
64-
# yield (message_target_id, elastic_action) pairs
63+
def build_elastic_actions(
64+
self,
65+
messages_chunk: messages.MessagesChunk,
66+
) -> typing.Iterable[tuple[int, dict | typing.Iterable[dict]]]:
67+
# yield (message_target_id, [elastic_action, ...]) pairs
6568
raise NotImplementedError
6669

6770
def before_chunk(
@@ -148,10 +151,17 @@ def pls_handle_messages_chunk(self, messages_chunk):
148151
_indexname = _response_body['_index']
149152
_is_done = _ok or (_op_type == 'delete' and _status == 404)
150153
if _is_done:
151-
_action_tracker.action_done(_indexname, _docid)
154+
_finished_message_id = _action_tracker.action_done(_indexname, _docid)
155+
if _finished_message_id is not None:
156+
yield messages.IndexMessageResponse(
157+
is_done=True,
158+
index_message=messages.IndexMessage(messages_chunk.message_type, _finished_message_id),
159+
status_code=HTTPStatus.OK.value,
160+
error_text=None,
161+
)
162+
_action_tracker.forget_message(_finished_message_id)
152163
else:
153164
_action_tracker.action_errored(_indexname, _docid)
154-
# yield error responses immediately
155165
yield messages.IndexMessageResponse(
156166
is_done=False,
157167
index_message=messages.IndexMessage(
@@ -161,16 +171,14 @@ def pls_handle_messages_chunk(self, messages_chunk):
161171
status_code=_status,
162172
error_text=str(_response_body),
163173
)
164-
self.after_chunk(messages_chunk, _indexnames)
165-
# yield successes after the whole chunk completes
166-
# (since one message may involve several actions)
167-
for _messageid in _action_tracker.all_done_messages():
174+
for _message_id in _action_tracker.remaining_done_messages():
168175
yield messages.IndexMessageResponse(
169176
is_done=True,
170-
index_message=messages.IndexMessage(messages_chunk.message_type, _messageid),
177+
index_message=messages.IndexMessage(messages_chunk.message_type, _message_id),
171178
status_code=HTTPStatus.OK.value,
172179
error_text=None,
173180
)
181+
self.after_chunk(messages_chunk, _indexnames)
174182

175183
# abstract method from IndexStrategy
176184
def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex):
@@ -202,14 +210,18 @@ def _alias_for_keeping_live(self):
202210
def _elastic_actions_with_index(self, messages_chunk, indexnames, action_tracker: _ActionTracker):
203211
if not indexnames:
204212
raise ValueError('cannot index to no indexes')
205-
for _message_target_id, _elastic_action in self.build_elastic_actions(messages_chunk):
206-
_docid = _elastic_action['_id']
207-
for _indexname in indexnames:
208-
action_tracker.add_action(_message_target_id, _indexname, _docid)
209-
yield {
210-
**_elastic_action,
211-
'_index': _indexname,
212-
}
213+
for _message_target_id, _elastic_actions in self.build_elastic_actions(messages_chunk):
214+
if isinstance(_elastic_actions, dict): # allow a single action
215+
_elastic_actions = [_elastic_actions]
216+
for _elastic_action in _elastic_actions:
217+
_docid = _elastic_action['_id']
218+
for _indexname in indexnames:
219+
action_tracker.add_action(_message_target_id, _indexname, _docid)
220+
yield {
221+
**_elastic_action,
222+
'_index': _indexname,
223+
}
224+
action_tracker.done_scheduling(_message_target_id)
213225

214226
def _get_indexnames_for_alias(self, alias_name) -> set[str]:
215227
try:
@@ -371,24 +383,37 @@ class _ActionTracker:
371383
default_factory=lambda: collections.defaultdict(set),
372384
)
373385
errored_messageids: set[int] = dataclasses.field(default_factory=set)
386+
fully_scheduled_messageids: set[int] = dataclasses.field(default_factory=set)
374387

375388
def add_action(self, message_id: int, index_name: str, doc_id: str):
376389
self.messageid_by_docid[doc_id] = message_id
377390
self.actions_by_messageid[message_id].add((index_name, doc_id))
378391

379-
def action_done(self, index_name: str, doc_id: str):
380-
_messageid = self.messageid_by_docid[doc_id]
381-
_message_actions = self.actions_by_messageid[_messageid]
382-
_message_actions.discard((index_name, doc_id))
392+
def action_done(self, index_name: str, doc_id: str) -> int | None:
393+
_messageid = self.get_message_id(doc_id)
394+
_remaining_message_actions = self.actions_by_messageid[_messageid]
395+
_remaining_message_actions.discard((index_name, doc_id))
396+
# return the message id only if this was the last action for that message
397+
return (
398+
None
399+
if _remaining_message_actions or (_messageid not in self.fully_scheduled_messageids)
400+
else _messageid
401+
)
383402

384403
def action_errored(self, index_name: str, doc_id: str):
385404
_messageid = self.messageid_by_docid[doc_id]
386405
self.errored_messageids.add(_messageid)
387406

407+
def done_scheduling(self, message_id: int):
408+
self.fully_scheduled_messageids.add(message_id)
409+
410+
def forget_message(self, message_id: int):
411+
del self.actions_by_messageid[message_id]
412+
388413
def get_message_id(self, doc_id: str):
389414
return self.messageid_by_docid[doc_id]
390415

391-
def all_done_messages(self):
416+
def remaining_done_messages(self):
392417
for _messageid, _actions in self.actions_by_messageid.items():
393418
if _messageid not in self.errored_messageids:
394419
assert not _actions

share/search/index_strategy/trovesearch_denorm.py

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
Literal,
1212
)
1313

14+
import celery
1415
from django.conf import settings
1516
import elasticsearch8
1617
from primitive_metadata import primitive_rdf as rdf
@@ -154,15 +155,14 @@ def _paths_and_values_mappings(self):
154155

155156
# override method from Elastic8IndexStrategy
156157
def after_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Iterable[str]):
157-
# refresh to avoid delete-by-query conflicts
158-
self.es8_client.indices.refresh(index=','.join(indexnames))
159-
# delete any docs that belong to cards in this chunk but weren't touched by indexing
160-
self.es8_client.delete_by_query(
161-
index=list(indexnames),
162-
query={'bool': {'must': [
163-
{'terms': {'card.card_pk': messages_chunk.target_ids_chunk}},
164-
{'range': {'chunk_timestamp': {'lt': messages_chunk.timestamp}}},
165-
]}},
158+
task__delete_iri_value_scraps.apply_async(
159+
kwargs={
160+
'index_strategy_name': self.name,
161+
'indexnames': list(indexnames),
162+
'card_pks': messages_chunk.target_ids_chunk,
163+
'timestamp': messages_chunk.timestamp,
164+
},
165+
countdown=settings.ELASTICSEARCH['POST_INDEX_DELAY'],
166166
)
167167

168168
# abstract method from Elastic8IndexStrategy
@@ -173,12 +173,13 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
173173
_docbuilder = self._SourcedocBuilder(_indexcard_rdf, messages_chunk.timestamp)
174174
if not _docbuilder.should_skip(): # if skipped, will be deleted
175175
_indexcard_pk = _indexcard_rdf.indexcard_id
176-
for _doc_id, _doc in _docbuilder.build_docs():
177-
_index_action = self.build_index_action(
176+
yield _indexcard_pk, (
177+
self.build_index_action(
178178
doc_id=_doc_id,
179179
doc_source=_doc,
180180
)
181-
yield _indexcard_pk, _index_action
181+
for _doc_id, _doc in _docbuilder.build_docs()
182+
)
182183
_remaining_indexcard_pks.discard(_indexcard_pk)
183184
# delete any that were skipped for any reason
184185
for _indexcard_pk in _remaining_indexcard_pks:
@@ -279,7 +280,10 @@ def should_skip(self) -> bool:
279280

280281
def build_docs(self) -> Iterator[tuple[str, dict]]:
281282
# index once without `iri_value`
282-
yield self._doc_id(), {'card': self._card_subdoc}
283+
yield self._doc_id(), {
284+
'card': self._card_subdoc,
285+
'chunk_timestamp': self.chunk_timestamp,
286+
}
283287
for _iri in self._fullwalk.paths_by_iri:
284288
yield self._doc_id(_iri), {
285289
'card': self._card_subdoc,
@@ -888,3 +892,46 @@ def _any_query(queries: abc.Collection[dict]):
888892
(_query,) = queries
889893
return _query
890894
return {'bool': {'should': list(queries), 'minimum_should_match': 1}}
895+
896+
897+
@celery.shared_task(
    name='share.search.index_strategy.trovesearch_denorm.task__delete_iri_value_scraps',
    max_retries=None,  # retries only on delete_by_query conflicts -- should work eventually!
    bind=True,  # for explicit retry
)
def task__delete_iri_value_scraps(
    task: celery.Task,
    index_strategy_name: str,
    card_pks: list[int],
    indexnames: list[str],
    timestamp: int,
):
    '''followup task to delete value-docs no longer present

    each time an index-card is updated, value-docs are created (or updated) for each iri value
    present in the card's contents -- if some values are absent from a later update, the
    corresponding docs will remain untouched

    this task deletes those untouched value-docs after the index has refreshed at its own pace
    (allowing a slightly longer delay for items to _stop_ matching queries for removed values)

    params:
        task: the bound celery task (used for explicit retry)
        index_strategy_name: name passed to `get_index_strategy` to look up the strategy
        card_pks: values matched against `card.card_pk` in the delete query
        indexnames: names of the indexes to delete from
        timestamp: docs with `chunk_timestamp` less than this are deleted

    retries (with exponential backoff, without limit) whenever the delete-by-query
    response reports any version conflicts
    '''
    # imported here rather than at module level (as in the original) -- presumably
    # to avoid a circular import; confirm before hoisting
    from share.search.index_strategy import get_index_strategy
    _index_strategy = get_index_strategy(index_strategy_name)
    assert isinstance(_index_strategy, Elastic8IndexStrategy)
    # delete any docs that belong to cards in this chunk but weren't touched by indexing
    _delete_resp = _index_strategy.es8_client.delete_by_query(
        index=indexnames,
        query={'bool': {'must': [
            {'terms': {'card.card_pk': card_pks}},
            {'range': {'chunk_timestamp': {'lt': timestamp}}},
        ]}},
        params={
            'slices': 'auto',
            'conflicts': 'proceed',  # count conflicts instead of halting
            'request_cache': False,
        },
    )
    _conflict_count = _delete_resp.get('version_conflicts', 0)
    if _conflict_count > 0:
        # celery's `retry_backoff` task option only applies to `autoretry_for`;
        # an explicit `retry()` without `countdown` waits a flat `default_retry_delay`
        # every time -- so compute the exponential backoff here (capped at ten minutes;
        # exponent capped too, since retries are unlimited)
        _retry_delay_seconds = min(2 ** min(task.request.retries, 10), 600)
        raise task.retry(countdown=_retry_delay_seconds)

share/search/messages.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def __init__(self, *, kombu_message=None):
142142
def ack(self):
143143
if self.kombu_message is None:
144144
raise exceptions.DaemonMessageError('ack! called DaemonMessage.ack() but there is nothing to ack')
145-
return self.kombu_message.ack()
145+
self.kombu_message.ack()
146146

147147
def requeue(self):
148148
if self.kombu_message is None:

tests/share/search/index_strategy/_with_real_services.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@ def setUp(self):
2424
super().setUp()
2525
self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups'))
2626
self.index_strategy = self.get_index_strategy()
27+
28+
def _fake_get_index_strategy(name):
29+
if self.index_strategy.name == name:
30+
return self.index_strategy
31+
raise ValueError(f'unknown index strategy in test: {name}')
32+
33+
self.enterContext(mock.patch(
34+
'share.search.index_strategy.get_index_strategy',
35+
new=_fake_get_index_strategy,
36+
))
2737
self.index_messenger = IndexMessenger(
2838
celery_app=celery_app,
2939
index_strategys=[self.index_strategy],
Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,29 @@
1-
from share.search.index_strategy.trovesearch_denorm import TrovesearchDenormIndexStrategy
1+
from unittest import mock
2+
3+
from share.search.index_strategy.trovesearch_denorm import (
4+
TrovesearchDenormIndexStrategy,
5+
task__delete_iri_value_scraps,
6+
)
27

38
from . import _common_trovesearch_tests
49

510

6-
class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests):
11+
class TestTrovesearchDenorm(_common_trovesearch_tests.CommonTrovesearchTests):
    '''the tests inherited from CommonTrovesearchTests, using TrovesearchDenormIndexStrategy'''

    def setUp(self):
        super().setUp()

        def _apply_without_waiting(*args, **kwargs):
            # call the followup delete task inline via `apply`,
            # with the countdown forced to zero so tests don't wait
            _eager_kwargs = {**kwargs, 'countdown': 0}
            task__delete_iri_value_scraps.apply(*args, **_eager_kwargs)

        # swap out `apply_async` for the duration of each test
        _apply_async_patch = mock.patch.object(
            task__delete_iri_value_scraps,
            'apply_async',
            new=_apply_without_waiting,
        )
        self.enterContext(_apply_async_patch)

    # for RealElasticTestCase
    def get_index_strategy(self):
        _strategy_name = 'test_trovesearch_denorm'
        return TrovesearchDenormIndexStrategy(_strategy_name)

0 commit comments

Comments
 (0)