
Commit 3943d2f

Fix up logic for delaying sending read receipts over federation. (#17933)
For context on why we delay read receipts, see matrix-org/synapse#4730.

Element Web often sends read receipts in quick succession: if it reloads the timeline, it sends one for the last message in the old timeline and another for the last message in the new timeline. Remote users would therefore see the read receipt for the older message arrive quickly, but the second receipt, for the most recent message, would take a while to follow.

There are two things going on in this PR:

1. There was a mismatch between seconds and milliseconds, so we ended up delaying for far longer than intended.
2. The logic is changed to reuse the `DestinationWakeupQueue` (already used for presence).

The changes in logic are:

- Treat the first receipt and subsequent receipts in a room in the same way.
- Whitelist certain classes of receipts so they are never delayed: receipts in small rooms, receipts for events that were sent within the last 60s, and receipts sent to the event sender's server.
- Cap the delay before a receipt is sent to a server at 30s, and send out receipts to remotes at least at 50Hz (by default).

The upshot is that receipts should feel snappier over federation. With this new logic, roughly 10%–20% of transactions should be sent immediately on matrix.org.
1 parent 93cc955 commit 3943d2f
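As a rough overview of the dispatch rule described in the commit message, here is a minimal standalone sketch. The constant and function names (SMALL_ROOM_THRESHOLD, RECENT_EVENT_MS, split_destinations) are illustrative assumptions, not Synapse identifiers; the real logic lives in FederationSender.send_read_receipt in the diff below.

# Illustrative sketch of the new dispatch rule; names are assumptions only.
SMALL_ROOM_THRESHOLD = 10   # rooms with fewer destinations always send immediately
RECENT_EVENT_MS = 60_000    # receipts for events this recent are never delayed

def split_destinations(domains, event_sender_domain, event_age_ms):
    """Return (immediate, delayed) destination lists for one read receipt."""
    if len(domains) < SMALL_ROOM_THRESHOLD or event_age_ms < RECENT_EVENT_MS:
        # Small room, or a fresh event: wake every destination right away.
        return list(domains), []
    # Otherwise only the event sender's server is woken immediately; the rest
    # are trickled out via the DestinationWakeupQueue (at most ~30s later).
    immediate = [d for d in domains if d == event_sender_domain]
    delayed = [d for d in domains if d != event_sender_domain]
    return immediate, delayed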

File tree

6 files changed: +220, -134 lines changed


changelog.d/17933.bugfix

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fix long-standing bug where read receipts could get overly delayed being sent over federation.

synapse/federation/sender/__init__.py

Lines changed: 88 additions & 77 deletions
@@ -140,7 +140,6 @@
     Iterable,
     List,
     Optional,
-    Set,
     Tuple,
 )

@@ -170,7 +169,13 @@
     run_as_background_process,
     wrap_as_background_process,
 )
-from synapse.types import JsonDict, ReadReceipt, RoomStreamToken, StrCollection
+from synapse.types import (
+    JsonDict,
+    ReadReceipt,
+    RoomStreamToken,
+    StrCollection,
+    get_domain_from_id,
+)
 from synapse.util import Clock
 from synapse.util.metrics import Measure
 from synapse.util.retryutils import filter_destinations_by_retry_limiter
@@ -297,12 +302,10 @@ class _DestinationWakeupQueue:
     # being woken up.
     _MAX_TIME_IN_QUEUE = 30.0

-    # The maximum duration in seconds between waking up consecutive destination
-    # queues.
-    _MAX_DELAY = 0.1
-
     sender: "FederationSender" = attr.ib()
     clock: Clock = attr.ib()
+    max_delay_s: int = attr.ib()
+
     queue: "OrderedDict[str, Literal[None]]" = attr.ib(factory=OrderedDict)
     processing: bool = attr.ib(default=False)

@@ -332,7 +335,7 @@ async def _handle(self) -> None:
         # We also add an upper bound to the delay, to gracefully handle the
         # case where the queue only has a few entries in it.
         current_sleep_seconds = min(
-            self._MAX_DELAY, self._MAX_TIME_IN_QUEUE / len(self.queue)
+            self.max_delay_s, self._MAX_TIME_IN_QUEUE / len(self.queue)
         )

         while self.queue:
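To see what the new max_delay_s bound does in practice, here is a small worked example of the calculation above. It assumes the default federation_rr_transactions_per_room_per_second of 50, so max_delay_s works out to 0.02s:

# Worked example of the sleep bound above (the 50/s default is an assumption).
MAX_TIME_IN_QUEUE = 30.0        # mirrors _MAX_TIME_IN_QUEUE
max_delay_s = 1.0 / 50          # 0.02s between wake-ups, i.e. up to 50Hz

for queued in (100, 1_500, 6_000):
    sleep = min(max_delay_s, MAX_TIME_IN_QUEUE / queued)
    print(f"{queued} queued destinations -> {sleep:.3f}s between wake-ups")
# 100   -> 0.020s: small queues are paced at 50Hz by max_delay_s
# 1500  -> 0.020s: the two bounds meet at 30s / 0.02s = 1500 entries
# 6000  -> 0.005s: larger queues speed up so everything drains within ~30s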
@@ -416,19 +419,14 @@ def __init__(self, hs: "HomeServer"):
416419
self._is_processing = False
417420
self._last_poked_id = -1
418421

419-
# map from room_id to a set of PerDestinationQueues which we believe are
420-
# awaiting a call to flush_read_receipts_for_room. The presence of an entry
421-
# here for a given room means that we are rate-limiting RR flushes to that room,
422-
# and that there is a pending call to _flush_rrs_for_room in the system.
423-
self._queues_awaiting_rr_flush_by_room: Dict[str, Set[PerDestinationQueue]] = {}
422+
self._external_cache = hs.get_external_cache()
424423

425-
self._rr_txn_interval_per_room_ms = (
426-
1000.0
427-
/ hs.config.ratelimiting.federation_rr_transactions_per_room_per_second
424+
rr_txn_interval_per_room_s = (
425+
1.0 / hs.config.ratelimiting.federation_rr_transactions_per_room_per_second
426+
)
427+
self._destination_wakeup_queue = _DestinationWakeupQueue(
428+
self, self.clock, max_delay_s=rr_txn_interval_per_room_s
428429
)
429-
430-
self._external_cache = hs.get_external_cache()
431-
self._destination_wakeup_queue = _DestinationWakeupQueue(self, self.clock)
432430

433431
# Regularly wake up destinations that have outstanding PDUs to be caught up
434432
self.clock.looping_call_now(
@@ -745,37 +743,48 @@ async def send_read_receipt(self, receipt: ReadReceipt) -> None:

         # Some background on the rate-limiting going on here.
         #
-        # It turns out that if we attempt to send out RRs as soon as we get them from
-        # a client, then we end up trying to do several hundred Hz of federation
-        # transactions. (The number of transactions scales as O(N^2) on the size of a
-        # room, since in a large room we have both more RRs coming in, and more servers
-        # to send them to.)
+        # It turns out that if we attempt to send out RRs as soon as we get them
+        # from a client, then we end up trying to do several hundred Hz of
+        # federation transactions. (The number of transactions scales as O(N^2)
+        # on the size of a room, since in a large room we have both more RRs
+        # coming in, and more servers to send them to.)
         #
-        # This leads to a lot of CPU load, and we end up getting behind. The solution
-        # currently adopted is as follows:
+        # This leads to a lot of CPU load, and we end up getting behind. The
+        # solution currently adopted is to differentiate between receipts and
+        # destinations we should immediately send to, and those we can trickle
+        # the receipts to.
         #
-        # The first receipt in a given room is sent out immediately, at time T0. Any
-        # further receipts are, in theory, batched up for N seconds, where N is calculated
-        # based on the number of servers in the room to achieve a transaction frequency
-        # of around 50Hz. So, for example, if there were 100 servers in the room, then
-        # N would be 100 / 50Hz = 2 seconds.
+        # The current logic is to send receipts out immediately if:
+        #   - the room is "small", i.e. there's only N servers to send receipts
+        #     to, and so sending out the receipts immediately doesn't cause too
+        #     much load; or
+        #   - the receipt is for an event that happened recently, as users
+        #     notice if receipts are delayed when they know other users are
+        #     currently reading the room; or
+        #   - the receipt is being sent to the server that sent the event, so
+        #     that users see receipts for their own receipts quickly.
         #
-        # Then, after T+N, we flush out any receipts that have accumulated, and restart
-        # the timer to flush out more receipts at T+2N, etc. If no receipts accumulate,
-        # we stop the cycle and go back to the start.
+        # For destinations that we should delay sending the receipt to, we queue
+        # the receipts up to be sent in the next transaction, but don't trigger
+        # a new transaction to be sent. We then add the destination to the
+        # `DestinationWakeupQueue`, which will slowly iterate over each
+        # destination and trigger a new transaction to be sent.
         #
-        # However, in practice, it is often possible to flush out receipts earlier: in
-        # particular, if we are sending a transaction to a given server anyway (for
-        # example, because we have a PDU or a RR in another room to send), then we may
-        # as well send out all of the pending RRs for that server. So it may be that
-        # by the time we get to T+N, we don't actually have any RRs left to send out.
-        # Nevertheless we continue to buffer up RRs for the room in question until we
-        # reach the point that no RRs arrive between timer ticks.
+        # However, in practice, it is often possible to send out delayed
+        # receipts earlier: in particular, if we are sending a transaction to a
+        # given server anyway (for example, because we have a PDU or a RR in
+        # another room to send), then we may as well send out all of the pending
+        # RRs for that server. So it may be that by the time we get to waking up
+        # the destination, we don't actually have any RRs left to send out.
         #
-        # For even more background, see https://github.com/matrix-org/synapse/issues/4730.
+        # For even more background, see
+        # https://github.com/matrix-org/synapse/issues/4730.

         room_id = receipt.room_id

+        # Local read receipts always have 1 event ID.
+        event_id = receipt.event_ids[0]
+
         # Work out which remote servers should be poked and poke them.
         domains_set = await self._storage_controllers.state.get_current_hosts_in_room_or_partial_state_approximation(
             room_id
@@ -797,49 +806,51 @@ async def send_read_receipt(self, receipt: ReadReceipt) -> None:
         if not domains:
             return

-        queues_pending_flush = self._queues_awaiting_rr_flush_by_room.get(room_id)
+        # We now split which domains we want to wake up immediately vs which we
+        # want to delay waking up.
+        immediate_domains: StrCollection
+        delay_domains: StrCollection

-        # if there is no flush yet scheduled, we will send out these receipts with
-        # immediate flushes, and schedule the next flush for this room.
-        if queues_pending_flush is not None:
-            logger.debug("Queuing receipt for: %r", domains)
+        if len(domains) < 10:
+            # For "small" rooms send to all domains immediately
+            immediate_domains = domains
+            delay_domains = ()
         else:
-            logger.debug("Sending receipt to: %r", domains)
-            self._schedule_rr_flush_for_room(room_id, len(domains))
+            metadata = await self.store.get_metadata_for_event(
+                receipt.room_id, event_id
+            )
+            assert metadata is not None

-        for domain in domains:
-            queue = self._get_per_destination_queue(domain)
-            queue.queue_read_receipt(receipt)
+            sender_domain = get_domain_from_id(metadata.sender)

-            # if there is already a RR flush pending for this room, then make sure this
-            # destination is registered for the flush
-            if queues_pending_flush is not None:
-                queues_pending_flush.add(queue)
+            if self.clock.time_msec() - metadata.received_ts < 60_000:
+                # We always send receipts for recent messages immediately
+                immediate_domains = domains
+                delay_domains = ()
             else:
-                queue.flush_read_receipts_for_room(room_id)
-
-    def _schedule_rr_flush_for_room(self, room_id: str, n_domains: int) -> None:
-        # that is going to cause approximately len(domains) transactions, so now back
-        # off for that multiplied by RR_TXN_INTERVAL_PER_ROOM
-        backoff_ms = self._rr_txn_interval_per_room_ms * n_domains
-
-        logger.debug("Scheduling RR flush in %s in %d ms", room_id, backoff_ms)
-        self.clock.call_later(backoff_ms, self._flush_rrs_for_room, room_id)
-        self._queues_awaiting_rr_flush_by_room[room_id] = set()
-
-    def _flush_rrs_for_room(self, room_id: str) -> None:
-        queues = self._queues_awaiting_rr_flush_by_room.pop(room_id)
-        logger.debug("Flushing RRs in %s to %s", room_id, queues)
-
-        if not queues:
-            # no more RRs arrived for this room; we are done.
-            return
+                # Otherwise, we delay waking up all destinations except for the
+                # sender's domain.
+                immediate_domains = []
+                delay_domains = []
+                for domain in domains:
+                    if domain == sender_domain:
+                        immediate_domains.append(domain)
+                    else:
+                        delay_domains.append(domain)
+
+        for domain in immediate_domains:
+            # Add to destination queue and wake the destination up
+            queue = self._get_per_destination_queue(domain)
+            queue.queue_read_receipt(receipt)
+            queue.attempt_new_transaction()

-        # schedule the next flush
-        self._schedule_rr_flush_for_room(room_id, len(queues))
+        for domain in delay_domains:
+            # Add to destination queue...
+            queue = self._get_per_destination_queue(domain)
+            queue.queue_read_receipt(receipt)

-        for queue in queues:
-            queue.flush_read_receipts_for_room(room_id)
+            # ... and schedule the destination to be woken up.
+            self._destination_wakeup_queue.add_to_queue(domain)

     async def send_presence_to_destinations(
         self, states: Iterable[UserPresenceState], destinations: Iterable[str]
synapse/federation/sender/per_destination_queue.py

Lines changed: 3 additions & 22 deletions
@@ -156,7 +156,6 @@ def __init__(
         # Each receipt can only have a single receipt per
         # (room ID, receipt type, user ID, thread ID) tuple.
         self._pending_receipt_edus: List[Dict[str, Dict[str, Dict[str, dict]]]] = []
-        self._rrs_pending_flush = False

         # stream_id of last successfully sent to-device message.
         # NB: may be a long or an int.
@@ -258,15 +257,7 @@ def queue_read_receipt(self, receipt: ReadReceipt) -> None:
            }
        )

-    def flush_read_receipts_for_room(self, room_id: str) -> None:
-        # If there are any pending receipts for this room then force-flush them
-        # in a new transaction.
-        for edu in self._pending_receipt_edus:
-            if room_id in edu:
-                self._rrs_pending_flush = True
-                self.attempt_new_transaction()
-                # No use in checking remaining EDUs if the room was found.
-                break
+        self.mark_new_data()

     def send_keyed_edu(self, edu: Edu, key: Hashable) -> None:
         self._pending_edus_keyed[(edu.edu_type, key)] = edu
@@ -603,12 +594,9 @@ async def _catch_up_transmission_loop(self) -> None:
            self._destination, last_successful_stream_ordering
        )

-    def _get_receipt_edus(self, force_flush: bool, limit: int) -> Iterable[Edu]:
+    def _get_receipt_edus(self, limit: int) -> Iterable[Edu]:
        if not self._pending_receipt_edus:
            return
-        if not force_flush and not self._rrs_pending_flush:
-            # not yet time for this lot
-            return

        # Send at most limit EDUs for receipts.
        for content in self._pending_receipt_edus[:limit]:
@@ -747,7 +735,7 @@ async def __aenter__(self) -> Tuple[List[EventBase], List[Edu]]:
        )

        # Add read receipt EDUs.
-        pending_edus.extend(self.queue._get_receipt_edus(force_flush=False, limit=5))
+        pending_edus.extend(self.queue._get_receipt_edus(limit=5))
        edu_limit = MAX_EDUS_PER_TRANSACTION - len(pending_edus)

        # Next, prioritize to-device messages so that existing encryption channels
@@ -795,13 +783,6 @@ async def __aenter__(self) -> Tuple[List[EventBase], List[Edu]]:
        if not self._pdus and not pending_edus:
            return [], []

-        # if we've decided to send a transaction anyway, and we have room, we
-        # may as well send any pending RRs
-        if edu_limit:
-            pending_edus.extend(
-                self.queue._get_receipt_edus(force_flush=True, limit=edu_limit)
-            )
-
        if self._pdus:
            self._last_stream_ordering = self._pdus[
                -1

synapse/storage/databases/main/cache.py

Lines changed: 2 additions & 0 deletions
@@ -322,6 +322,7 @@ def _invalidate_caches_for_event(
        self._attempt_to_invalidate_cache(
            "get_unread_event_push_actions_by_room_for_user", (room_id,)
        )
+        self._attempt_to_invalidate_cache("get_metadata_for_event", (room_id, event_id))

        self._attempt_to_invalidate_cache("_get_max_event_pos", (room_id,))

@@ -446,6 +447,7 @@ def _invalidate_caches_for_room_events(self, room_id: str) -> None:
        self._attempt_to_invalidate_cache("_get_state_group_for_event", None)

        self._attempt_to_invalidate_cache("get_event_ordering", None)
+        self._attempt_to_invalidate_cache("get_metadata_for_event", (room_id,))
        self._attempt_to_invalidate_cache("is_partial_state_event", None)
        self._attempt_to_invalidate_cache("_get_joined_profile_from_event_id", None)
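Note that the per-event path above invalidates with the full key (room_id, event_id), while the per-room path passes only the (room_id,) prefix. That works because get_metadata_for_event is declared with @cached(tree=True) in the next file, which lets a key prefix invalidate every cached entry beneath it. A rough illustration of that idea, not Synapse's actual TreeCache implementation:

# Conceptual sketch of prefix invalidation in a tree-keyed cache.
cache = {
    ("!room1", "$event_a"): "meta_a",
    ("!room1", "$event_b"): "meta_b",
    ("!room2", "$event_c"): "meta_c",
}

def invalidate(prefix):
    for key in [k for k in cache if k[: len(prefix)] == prefix]:
        del cache[key]

invalidate(("!room1",))             # drops both !room1 entries
invalidate(("!room2", "$event_c"))  # drops exactly one entry
assert cache == {}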

synapse/storage/databases/main/events_worker.py

Lines changed: 27 additions & 0 deletions
@@ -193,6 +193,14 @@ class _EventRow:
    outlier: bool


+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class EventMetadata:
+    """Event metadata returned by `get_metadata_for_event(..)`"""
+
+    sender: str
+    received_ts: int
+
+
 class EventRedactBehaviour(Enum):
    """
    What to do when retrieving a redacted event from the database.
@@ -2580,3 +2588,22 @@ async def have_finished_sliding_sync_background_jobs(self) -> bool:
                _BackgroundUpdates.SLIDING_SYNC_MEMBERSHIP_SNAPSHOTS_BG_UPDATE,
            )
        )
+
+    @cached(tree=True)
+    async def get_metadata_for_event(
+        self, room_id: str, event_id: str
+    ) -> Optional[EventMetadata]:
+        row = await self.db_pool.simple_select_one(
+            table="events",
+            keyvalues={"room_id": room_id, "event_id": event_id},
+            retcols=("sender", "received_ts"),
+            allow_none=True,
+            desc="get_metadata_for_event",
+        )
+        if row is None:
+            return None
+
+        return EventMetadata(
+            sender=row[0],
+            received_ts=row[1],
+        )
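For completeness, this is roughly how the new store method is consumed by the sender earlier in this commit. The snippet is a paraphrase under stated assumptions (a `store` exposing the new method and a `clock` with time_msec()), not additional code from the diff:

# Paraphrased call site, mirroring the sender's use of get_metadata_for_event.
async def classify_receipt_event(store, clock, room_id: str, event_id: str):
    metadata = await store.get_metadata_for_event(room_id, event_id)
    if metadata is None:
        # Shouldn't happen for a local receipt: the event must already be
        # persisted locally, hence the bare `assert` in the sender.
        raise AssertionError("receipt refers to an unknown event")

    age_ms = clock.time_msec() - metadata.received_ts
    is_recent = age_ms < 60_000                       # the sender's 60s window
    sender_server = metadata.sender.split(":", 1)[1]  # e.g. "example.com"
    return is_recent, sender_server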
