Limit and filter the number of backfill points to get from the database #13879
Changes from 92 commits
Changelog entry:

```diff
@@ -0,0 +1 @@
+Only pull relevant backfill points from the database based on the current depth and limit (instead of all) every time we want to `/backfill`.
```
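Conceptually, the change pushes the depth filter and row limit into the database query itself, instead of pulling every backfill point and filtering in Python. A rough sketch of the query shape; the table and column names here are assumptions for illustration, not Synapse's actual schema or SQL:

```python
# Illustrative only: a simplified before/after of the query shape.
# Table and column names are assumptions, not Synapse's real schema.

BEFORE = """
    SELECT event_id, depth FROM backward_extremities
    WHERE room_id = ?
"""

AFTER = """
    SELECT event_id, depth FROM backward_extremities
    WHERE room_id = ? AND depth <= ?   -- filter by the caller's current_depth
    ORDER BY depth DESC                -- deepest (most recent) points first
    LIMIT ?                            -- cap how many rows we pull back
"""
```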
```diff
@@ -38,7 +38,7 @@
 from unpaddedbase64 import decode_base64

 from synapse import event_auth
-from synapse.api.constants import EventContentFields, EventTypes, Membership
+from synapse.api.constants import MAX_DEPTH, EventContentFields, EventTypes, Membership
 from synapse.api.errors import (
     AuthError,
     CodeMessageException,
```
```diff
@@ -209,7 +209,7 @@ async def _maybe_backfill_inner(
         current_depth: int,
         limit: int,
         *,
-        processing_start_time: int,
+        processing_start_time: Optional[int],
     ) -> bool:
         """
         Checks whether the `current_depth` is at or approaching any backfill
```
```diff
@@ -221,20 +221,36 @@ async def _maybe_backfill_inner(
             room_id: The room to backfill in.
             current_depth: The depth to check at for any upcoming backfill points.
             limit: The max number of events to request from the remote federated server.
-            processing_start_time: The time when `maybe_backfill` started
-                processing. Only used for timing.
+            processing_start_time: The time when `maybe_backfill` started processing.
+                Only used for timing. If `None`, no timing observation will be made.
         """
         backwards_extremities = [
             _BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY)
-            for event_id, depth in await self.store.get_backfill_points_in_room(room_id)
+            for event_id, depth in await self.store.get_backfill_points_in_room(
+                room_id=room_id,
+                current_depth=current_depth,
+                # We only need to end up with 5 extremities combined with the
+                # insertion event extremities to make the `/backfill` request,
+                # but we fetch an order of magnitude more to make sure there
+                # are enough even after we filter them by whether they are
+                # visible in the history. This isn't fool-proof, as all
+                # backfill points within our limit could be filtered out, but
+                # it seems like a good amount to try with at least.
+                limit=50,
+            )
         ]

         insertion_events_to_be_backfilled: List[_BackfillPoint] = []
         if self.hs.config.experimental.msc2716_enabled:
             insertion_events_to_be_backfilled = [
                 _BackfillPoint(event_id, depth, _BackfillPointType.INSERTION_PONT)
                 for event_id, depth in await self.store.get_insertion_event_backward_extremities_in_room(
-                    room_id
+                    room_id=room_id,
+                    current_depth=current_depth,
+                    # We only need to end up with 5 extremities combined with
+                    # the backfill points to make the `/backfill` request ...
+                    # (see the other comment above for more context).
+                    limit=50,
                 )
             ]
         logger.debug(
```
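The `limit=50` is headroom: only about 5 points are ultimately needed, but some candidates will later be dropped by the history-visibility filter. A toy sketch of that fetch-extra-then-filter pattern, with hypothetical names (not Synapse's actual helpers):

```python
FETCH_LIMIT = 50  # an order of magnitude more than the 5 we actually need
WANTED = 5

def pick_backfill_points(candidates: list, is_visible) -> list:
    """Drop candidates the server can't see, then keep the first few.

    `candidates` is assumed to be sorted deepest-first, so the survivors
    are still the deepest usable backfill points.
    """
    return [c for c in candidates if is_visible(c)][:WANTED]

# Example: depths 100..51; pretend odd depths fail the visibility check.
depths = list(range(100, 100 - FETCH_LIMIT, -1))
print(pick_backfill_points(depths, lambda d: d % 2 == 0))
# -> [100, 98, 96, 94, 92]
```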
```diff
@@ -243,10 +259,6 @@ async def _maybe_backfill_inner(
             insertion_events_to_be_backfilled,
         )

-        if not backwards_extremities and not insertion_events_to_be_backfilled:
-            logger.debug("Not backfilling as no extremeties found.")
-            return False
-
         # we now have a list of potential places to backpaginate from. We prefer to
         # start with the most recent (ie, max depth), so let's sort the list.
         sorted_backfill_points: List[_BackfillPoint] = sorted(
```
**Member:** we could maybe use …

**Contributor (Author):** @squahtx mentioned this in #13635 (comment) when I asked about any potential optimizations for merging two already-sorted lists. Do we want to do this though? In this PR, we're also adding the …

**Member:** 🤷
```diff
@@ -267,6 +279,33 @@ async def _maybe_backfill_inner(
             sorted_backfill_points,
         )

+        # If we have no backfill points lower than the `current_depth` then
+        # we can either a) bail or b) still attempt to backfill. We opt to try
+        # backfilling anyway just in case we do get relevant events.
+        if not sorted_backfill_points and current_depth != MAX_DEPTH:
+            logger.debug(
+                "_maybe_backfill_inner: all backfill points are *after* current depth. Trying again with later backfill points."
+            )
+            return await self._maybe_backfill_inner(
+                room_id=room_id,
+                # We use `MAX_DEPTH` so that we find all backfill points next
+                # time (all events are below the `MAX_DEPTH`).
+                current_depth=MAX_DEPTH,
+                limit=limit,
+                # We don't want to start another timing observation from this
+                # nested recursive call. The top-most call can record the
+                # overall time; otherwise the smaller one will throw off the results.
+                processing_start_time=None,
+            )
+
+        # Even after recursing with `MAX_DEPTH`, we didn't find any
+        # backward extremities to backfill from.
+        if not sorted_backfill_points:
+            logger.debug(
+                "_maybe_backfill_inner: Not backfilling as no backward extremities found."
+            )
+            return False
+
         # If we're approaching an extremity we trigger a backfill, otherwise we
         # no-op.
         #
```
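The recursion here is bounded to a single retry: the nested call passes `current_depth=MAX_DEPTH`, so its own `current_depth != MAX_DEPTH` guard can never fire again. A stripped-down, runnable sketch of the control flow (the helper and constant value are hypothetical stand-ins):

```python
import asyncio

MAX_DEPTH = 2**63 - 1  # stand-in; Synapse defines its own MAX_DEPTH constant

async def get_points_at_or_below(depth: int) -> list:
    # Hypothetical store call: pretend the only backfill point sits at depth 100.
    return [100] if depth >= 100 else []

async def maybe_backfill_inner(current_depth: int) -> bool:
    points = await get_points_at_or_below(current_depth)

    if not points and current_depth != MAX_DEPTH:
        # Retry once considering *all* points; the guard above prevents
        # a second retry, so the recursion always terminates.
        return await maybe_backfill_inner(MAX_DEPTH)

    if not points:
        return False  # even MAX_DEPTH found nothing to backfill from
    return True

# Depth 50 finds nothing at first, retries with MAX_DEPTH, then succeeds.
print(asyncio.run(maybe_backfill_inner(50)))  # True
```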
```diff
@@ -276,47 +315,16 @@ async def _maybe_backfill_inner(
         # chose more than one times the limit in case of failure, but choosing a
         # much larger factor will result in triggering a backfill request much
         # earlier than necessary.
-        #
-        # XXX: shouldn't we do this *after* the filter by depth below? Again, we don't
-        # care about events that have happened after our current position.
-        #
-        max_depth = sorted_backfill_points[0].depth
-        if current_depth - 2 * limit > max_depth:
+        max_depth_of_backfill_points = sorted_backfill_points[0].depth
+        if current_depth - 2 * limit > max_depth_of_backfill_points:
             logger.debug(
                 "Not backfilling as we don't need to. %d < %d - 2 * %d",
-                max_depth,
+                max_depth_of_backfill_points,
                 current_depth,
                 limit,
             )
             return False

-        # We ignore extremities that have a greater depth than our current depth
-        # as:
-        #   1. we don't really care about getting events that have happened
-        #      after our current position; and
-        #   2. we have likely previously tried and failed to backfill from that
-        #      extremity, so to avoid getting "stuck" requesting the same
-        #      backfill repeatedly we drop those extremities.
-        #
-        # However, we need to check that the filtered extremities are non-empty.
-        # If they are empty then either we can a) bail or b) still attempt to
-        # backfill. We opt to try backfilling anyway just in case we do get
-        # relevant events.
-        #
-        filtered_sorted_backfill_points = [
-            t for t in sorted_backfill_points if t.depth <= current_depth
-        ]
-        if filtered_sorted_backfill_points:
-            logger.debug(
-                "_maybe_backfill_inner: backfill points before current depth: %s",
-                filtered_sorted_backfill_points,
-            )
-            sorted_backfill_points = filtered_sorted_backfill_points
-        else:
-            logger.debug(
-                "_maybe_backfill_inner: all backfill points are *after* current depth. Backfilling anyway."
-            )
-
         # For performance's sake, we only want to paginate from a particular extremity
         # if we can actually see the events we'll get. Otherwise, we'd just spend a lot
         # of resources to get redacted events. We check each extremity in turn and
```
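To make the `current_depth - 2 * limit > max_depth_of_backfill_points` heuristic concrete, a worked example with made-up numbers:

```python
current_depth = 500                 # where the client is currently paginating
limit = 100                         # events per pagination request
max_depth_of_backfill_points = 250  # deepest backfill point we found

# 500 - 2 * 100 = 300, and 300 > 250: the nearest backfill point is more
# than two requests' worth of events behind us, so we can keep serving
# pagination locally and skip backfilling for now.
if current_depth - 2 * limit > max_depth_of_backfill_points:
    print("Not backfilling as we don't need to.")
```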
```diff
@@ -450,10 +458,15 @@ async def try_backfill(domains: Collection[str]) -> bool:

             return False

-        processing_end_time = self.clock.time_msec()
-        backfill_processing_before_timer.observe(
-            (processing_end_time - processing_start_time) / 1000
-        )
+        # If we have the `processing_start_time`, then we can make an
+        # observation. We wouldn't have the `processing_start_time` in the case
+        # where `_maybe_backfill_inner` is recursively called to find any
+        # backfill points regardless of `current_depth`.
+        if processing_start_time is not None:
+            processing_end_time = self.clock.time_msec()
+            backfill_processing_before_timer.observe(
+                (processing_end_time - processing_start_time) / 1000
+            )

         success = await try_backfill(likely_domains)
         if success:
```
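`backfill_processing_before_timer` is a histogram metric; the guarded observation amounts to the pattern below, sketched with `prometheus_client` (the metric name and wrapper function are illustrative, not Synapse's actual definitions):

```python
import time
from typing import Optional

from prometheus_client import Histogram

# Illustrative metric; Synapse's real name and buckets differ.
backfill_processing_before_timer = Histogram(
    "backfill_processing_before_seconds",
    "Time spent deciding whether and where to backfill from",
)

def record_processing_time(processing_start_time: Optional[float]) -> None:
    # Only the outermost call passes a start time; nested retries pass
    # None so that exactly one sample covers the whole operation.
    if processing_start_time is not None:
        backfill_processing_before_timer.observe(time.time() - processing_start_time)
```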