Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 0f971ca

Browse files
Update get_pdu to return the original, pristine EventBase (#13320)
Update `get_pdu` to return the untouched, pristine `EventBase` as it was originally seen over federation (no metadata added). Previously, we returned the same `event` reference that we stored in the cache, which downstream code modified in place and added metadata (like setting it as an `outlier`), essentially poisoning our cache. Now we always return a copy of the `event`, so the original can stay pristine in our cache and be re-used for the next cache call. Split out from #13205. As discussed at: - #13205 (comment) - #13205 (comment) Related to #12584. This PR doesn't fix that issue because it hits [`get_event`, which exits early with the event from the local database before it tries to `get_pdu`](https://github.com/matrix-org/synapse/blob/7864f33e286dec22368dc0b11c06eebb1462a51e/synapse/federation/federation_client.py#L581-L594).
1 parent a1b62af commit 0f971ca

File tree

5 files changed

+233
-61
lines changed

5 files changed

+233
-61
lines changed

changelog.d/13320.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix `FederationClient.get_pdu()` returning events from the cache as `outliers` instead of original events we saw over federation.

synapse/federation/federation_client.py

Lines changed: 81 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
RoomVersion,
5454
RoomVersions,
5555
)
56-
from synapse.events import EventBase, builder
56+
from synapse.events import EventBase, builder, make_event_from_dict
5757
from synapse.federation.federation_base import (
5858
FederationBase,
5959
InvalidEventSignatureError,
@@ -299,7 +299,8 @@ async def get_pdu_from_destination_raw(
299299
moving to the next destination. None indicates no timeout.
300300
301301
Returns:
302-
The requested PDU, or None if we were unable to find it.
302+
A copy of the requested PDU that is safe to modify, or None if we
303+
were unable to find it.
303304
304305
Raises:
305306
SynapseError, NotRetryingDestination, FederationDeniedError
@@ -309,7 +310,7 @@ async def get_pdu_from_destination_raw(
309310
)
310311

311312
logger.debug(
312-
"retrieved event id %s from %s: %r",
313+
"get_pdu_from_destination_raw: retrieved event id %s from %s: %r",
313314
event_id,
314315
destination,
315316
transaction_data,
@@ -358,54 +359,92 @@ async def get_pdu(
358359
The requested PDU, or None if we were unable to find it.
359360
"""
360361

361-
# TODO: Rate limit the number of times we try and get the same event.
362+
logger.debug(
363+
"get_pdu: event_id=%s from destinations=%s", event_id, destinations
364+
)
362365

363-
ev = self._get_pdu_cache.get(event_id)
364-
if ev:
365-
return ev
366+
# TODO: Rate limit the number of times we try and get the same event.
366367

367-
pdu_attempts = self.pdu_destination_tried.setdefault(event_id, {})
368+
# We might need the same event multiple times in quick succession (before
369+
# it gets persisted to the database), so we cache the results of the lookup.
370+
# Note that this is separate to the regular get_event cache which caches
371+
# events once they have been persisted.
372+
event = self._get_pdu_cache.get(event_id)
373+
374+
# If we don't see the event in the cache, go try to fetch it from the
375+
# provided remote federated destinations
376+
if not event:
377+
pdu_attempts = self.pdu_destination_tried.setdefault(event_id, {})
378+
379+
for destination in destinations:
380+
now = self._clock.time_msec()
381+
last_attempt = pdu_attempts.get(destination, 0)
382+
if last_attempt + PDU_RETRY_TIME_MS > now:
383+
logger.debug(
384+
"get_pdu: skipping destination=%s because we tried it recently last_attempt=%s and we only check every %s (now=%s)",
385+
destination,
386+
last_attempt,
387+
PDU_RETRY_TIME_MS,
388+
now,
389+
)
390+
continue
391+
392+
try:
393+
event = await self.get_pdu_from_destination_raw(
394+
destination=destination,
395+
event_id=event_id,
396+
room_version=room_version,
397+
timeout=timeout,
398+
)
368399

369-
signed_pdu = None
370-
for destination in destinations:
371-
now = self._clock.time_msec()
372-
last_attempt = pdu_attempts.get(destination, 0)
373-
if last_attempt + PDU_RETRY_TIME_MS > now:
374-
continue
400+
pdu_attempts[destination] = now
375401

376-
try:
377-
signed_pdu = await self.get_pdu_from_destination_raw(
378-
destination=destination,
379-
event_id=event_id,
380-
room_version=room_version,
381-
timeout=timeout,
382-
)
402+
if event:
403+
# Prime the cache
404+
self._get_pdu_cache[event.event_id] = event
383405

384-
pdu_attempts[destination] = now
406+
# FIXME: We should add a `break` here to avoid calling every
407+
# destination after we already found a PDU (will follow-up
408+
# in a separate PR)
385409

386-
except SynapseError as e:
387-
logger.info(
388-
"Failed to get PDU %s from %s because %s", event_id, destination, e
389-
)
390-
continue
391-
except NotRetryingDestination as e:
392-
logger.info(str(e))
393-
continue
394-
except FederationDeniedError as e:
395-
logger.info(str(e))
396-
continue
397-
except Exception as e:
398-
pdu_attempts[destination] = now
410+
except SynapseError as e:
411+
logger.info(
412+
"Failed to get PDU %s from %s because %s",
413+
event_id,
414+
destination,
415+
e,
416+
)
417+
continue
418+
except NotRetryingDestination as e:
419+
logger.info(str(e))
420+
continue
421+
except FederationDeniedError as e:
422+
logger.info(str(e))
423+
continue
424+
except Exception as e:
425+
pdu_attempts[destination] = now
426+
427+
logger.info(
428+
"Failed to get PDU %s from %s because %s",
429+
event_id,
430+
destination,
431+
e,
432+
)
433+
continue
399434

400-
logger.info(
401-
"Failed to get PDU %s from %s because %s", event_id, destination, e
402-
)
403-
continue
435+
if not event:
436+
return None
404437

405-
if signed_pdu:
406-
self._get_pdu_cache[event_id] = signed_pdu
438+
# `event` now refers to an object stored in `get_pdu_cache`. Our
439+
# callers may need to modify the returned object (eg to set
440+
# `event.internal_metadata.outlier = true`), so we return a copy
441+
# rather than the original object.
442+
event_copy = make_event_from_dict(
443+
event.get_pdu_json(),
444+
event.room_version,
445+
)
407446

408-
return signed_pdu
447+
return event_copy
409448

410449
async def get_room_state_ids(
411450
self, destination: str, room_id: str, event_id: str

synapse/handlers/federation_event.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -766,10 +766,24 @@ async def _process_pulled_event(
766766
"""
767767
logger.info("Processing pulled event %s", event)
768768

769-
# these should not be outliers.
770-
assert (
771-
not event.internal_metadata.is_outlier()
772-
), "pulled event unexpectedly flagged as outlier"
769+
# This function should not be used to persist outliers (use something
770+
# else) because this does a bunch of operations that aren't necessary
771+
# (extra work; in particular, it makes sure we have all the prev_events
772+
# and resolves the state across those prev events). If you happen to run
773+
# into a situation where the event you're trying to process/backfill is
774+
# marked as an `outlier`, then you should update that spot to return an
775+
# `EventBase` copy that doesn't have `outlier` flag set.
776+
#
777+
# `EventBase` is used to represent both an event we have not yet
778+
# persisted, and one that we have persisted and now keep in the cache.
779+
# In an ideal world this method would only be called with the first type
780+
# of event, but it turns out that's not actually the case and for
781+
# example, you could get an event from cache that is marked as an
782+
# `outlier` (fix up that spot though).
783+
assert not event.internal_metadata.is_outlier(), (
784+
"Outlier event passed to _process_pulled_event. "
785+
"To persist an event as a non-outlier, make sure to pass in a copy without `event.internal_metadata.outlier = true`."
786+
)
773787

774788
event_id = event.event_id
775789

synapse/storage/databases/main/events.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,9 +1346,24 @@ def _update_outliers_txn(
13461346
event_id: outlier for event_id, outlier in txn
13471347
}
13481348

1349+
logger.debug(
1350+
"_update_outliers_txn: events=%s have_persisted=%s",
1351+
[ev.event_id for ev, _ in events_and_contexts],
1352+
have_persisted,
1353+
)
1354+
13491355
to_remove = set()
13501356
for event, context in events_and_contexts:
1351-
if event.event_id not in have_persisted:
1357+
outlier_persisted = have_persisted.get(event.event_id)
1358+
logger.debug(
1359+
"_update_outliers_txn: event=%s outlier=%s outlier_persisted=%s",
1360+
event.event_id,
1361+
event.internal_metadata.is_outlier(),
1362+
outlier_persisted,
1363+
)
1364+
1365+
# Ignore events which we haven't persisted at all
1366+
if outlier_persisted is None:
13521367
continue
13531368

13541369
to_remove.add(event)
@@ -1358,7 +1373,6 @@ def _update_outliers_txn(
13581373
# was an outlier or not - what we have is at least as good.
13591374
continue
13601375

1361-
outlier_persisted = have_persisted[event.event_id]
13621376
if not event.internal_metadata.is_outlier() and outlier_persisted:
13631377
# We received a copy of an event that we had already stored as
13641378
# an outlier in the database. We now have some state at that event
@@ -1369,7 +1383,10 @@ def _update_outliers_txn(
13691383
# events down /sync. In general they will be historical events, so that
13701384
# doesn't matter too much, but that is not always the case.
13711385

1372-
logger.info("Updating state for ex-outlier event %s", event.event_id)
1386+
logger.info(
1387+
"_update_outliers_txn: Updating state for ex-outlier event %s",
1388+
event.event_id,
1389+
)
13731390

13741391
# insert into event_to_state_groups.
13751392
try:

0 commit comments

Comments
 (0)