Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 957e3d7

Browse files
Keep track when we try and fail to process a pulled event (#13589)
We can follow-up this PR with: 1. Only try to backfill from an event if we haven't tried recently -> #13622 1. When we decide to backfill that event again, process it in the background so it doesn't block and make `/messages` slow when we know it will probably fail again -> #13623 1. Generally track failures everywhere we try and fail to pull an event over federation -> #13700 Fix #13621 Part of #13356 Mentioned in [internal doc](https://docs.google.com/document/d/1lvUoVfYUiy6UaHB6Rb4HicjaJAU40-APue9Q4vzuW3c/edit#bookmark=id.qv7cj51sv9i5)
1 parent 666ae87 commit 957e3d7

File tree

7 files changed

+329
-9
lines changed

7 files changed

+329
-9
lines changed

changelog.d/13589.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Keep track when we attempt to backfill an event but fail so we can intelligently back-off in the future.

synapse/handlers/federation_event.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,9 @@ async def _process_pulled_event(
862862
self._sanity_check_event(event)
863863
except SynapseError as err:
864864
logger.warning("Event %s failed sanity check: %s", event_id, err)
865+
await self._store.record_event_failed_pull_attempt(
866+
event.room_id, event_id, str(err)
867+
)
865868
return
866869

867870
try:
@@ -897,6 +900,10 @@ async def _process_pulled_event(
897900
backfilled=backfilled,
898901
)
899902
except FederationError as e:
903+
await self._store.record_event_failed_pull_attempt(
904+
event.room_id, event_id, str(e)
905+
)
906+
900907
if e.code == 403:
901908
logger.warning("Pulled event %s failed history check.", event_id)
902909
else:

synapse/storage/databases/main/event_federation.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1294,6 +1294,51 @@ def _get_backfill_events(
12941294

12951295
return event_id_results
12961296

1297+
@trace
1298+
async def record_event_failed_pull_attempt(
1299+
self, room_id: str, event_id: str, cause: str
1300+
) -> None:
1301+
"""
1302+
Record when we fail to pull an event over federation.
1303+
1304+
This information allows us to be more intelligent when we decide to
1305+
retry (we don't need to fail over and over) and we can process that
1306+
event in the background so we don't block on it each time.
1307+
1308+
Args:
1309+
room_id: The room where the event failed to pull from
1310+
event_id: The event that failed to be fetched or processed
1311+
cause: The error message or reason that we failed to pull the event
1312+
"""
1313+
await self.db_pool.runInteraction(
1314+
"record_event_failed_pull_attempt",
1315+
self._record_event_failed_pull_attempt_upsert_txn,
1316+
room_id,
1317+
event_id,
1318+
cause,
1319+
db_autocommit=True, # Safe as it's a single upsert
1320+
)
1321+
1322+
def _record_event_failed_pull_attempt_upsert_txn(
1323+
self,
1324+
txn: LoggingTransaction,
1325+
room_id: str,
1326+
event_id: str,
1327+
cause: str,
1328+
) -> None:
1329+
sql = """
1330+
INSERT INTO event_failed_pull_attempts (
1331+
room_id, event_id, num_attempts, last_attempt_ts, last_cause
1332+
)
1333+
VALUES (?, ?, ?, ?, ?)
1334+
ON CONFLICT (room_id, event_id) DO UPDATE SET
1335+
num_attempts=event_failed_pull_attempts.num_attempts + 1,
1336+
last_attempt_ts=EXCLUDED.last_attempt_ts,
1337+
last_cause=EXCLUDED.last_cause;
1338+
"""
1339+
1340+
txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))
1341+
12971342
async def get_missing_events(
12981343
self,
12991344
room_id: str,

synapse/storage/databases/main/events.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,17 +2435,31 @@ def _update_backward_extremeties(
24352435
"DELETE FROM event_backward_extremities"
24362436
" WHERE event_id = ? AND room_id = ?"
24372437
)
2438+
backward_extremity_tuples_to_remove = [
2439+
(ev.event_id, ev.room_id)
2440+
for ev in events
2441+
if not ev.internal_metadata.is_outlier()
2442+
# If we encountered an event with no prev_events, then we might
2443+
# as well remove it now because it won't ever have anything else
2444+
# to backfill from.
2445+
or len(ev.prev_event_ids()) == 0
2446+
]
24382447
txn.execute_batch(
24392448
query,
2440-
[
2441-
(ev.event_id, ev.room_id)
2442-
for ev in events
2443-
if not ev.internal_metadata.is_outlier()
2444-
# If we encountered an event with no prev_events, then we might
2445-
# as well remove it now because it won't ever have anything else
2446-
# to backfill from.
2447-
or len(ev.prev_event_ids()) == 0
2448-
],
2449+
backward_extremity_tuples_to_remove,
2450+
)
2451+
2452+
# Clear out the failed backfill attempts after we successfully pulled
2453+
# the event. Since we no longer need these events as backward
2454+
# extremities, it also means that they won't be backfilled from again so
2455+
# we no longer need to store the backfill attempts around it.
2456+
query = """
2457+
DELETE FROM event_failed_pull_attempts
2458+
WHERE event_id = ? and room_id = ?
2459+
"""
2460+
txn.execute_batch(
2461+
query,
2462+
backward_extremity_tuples_to_remove,
24492463
)
24502464

24512465

synapse/storage/schema/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@
8181
Changes in SCHEMA_VERSION = 73;
8282
- thread_id column is added to event_push_actions, event_push_actions_staging
8383
event_push_summary, receipts_linearized, and receipts_graph.
84+
- Add table `event_failed_pull_attempts` to keep track when we fail to pull
85+
events over federation.
8486
"""
8587

8688

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/* Copyright 2022 The Matrix.org Foundation C.I.C
2+
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
17+
-- Add a table that keeps track of when we failed to pull an event over
18+
-- federation (via /backfill, `/event`, `/get_missing_events`, etc). This allows
19+
-- us to be more intelligent when we decide to retry (we don't need to fail over
20+
-- and over) and we can process that event in the background so we don't block
21+
-- on it each time.
22+
CREATE TABLE IF NOT EXISTS event_failed_pull_attempts(
23+
room_id TEXT NOT NULL REFERENCES rooms (room_id),
24+
event_id TEXT NOT NULL,
25+
num_attempts INT NOT NULL,
26+
last_attempt_ts BIGINT NOT NULL,
27+
last_cause TEXT NOT NULL,
28+
PRIMARY KEY (room_id, event_id)
29+
);

tests/handlers/test_federation_event.py

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,225 @@ async def get_event(destination: str, event_id: str, timeout=None):
227227

228228
if prev_exists_as_outlier:
229229
self.mock_federation_transport_client.get_event.assert_not_called()
230+
231+
def test_process_pulled_event_records_failed_backfill_attempts(
232+
self,
233+
) -> None:
234+
"""
235+
Test to make sure that failed backfill attempts for an event are
236+
recorded in the `event_failed_pull_attempts` table.
237+
238+
In this test, we pretend we are processing a "pulled" event via
239+
backfill. The pulled event has a fake `prev_event` which our server has
240+
obviously never seen before so it attempts to request the state at that
241+
`prev_event` which expectedly fails because it's a fake event. Because
242+
the server can't fetch the state at the missing `prev_event`, the
243+
"pulled" event fails the history check and is fails to process.
244+
245+
We check that we correctly record the number of failed pull attempts
246+
of the pulled event and as a sanity check, that the "pulled" event isn't
247+
persisted.
248+
"""
249+
OTHER_USER = f"@user:{self.OTHER_SERVER_NAME}"
250+
main_store = self.hs.get_datastores().main
251+
252+
# Create the room
253+
user_id = self.register_user("kermit", "test")
254+
tok = self.login("kermit", "test")
255+
room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)
256+
room_version = self.get_success(main_store.get_room_version(room_id))
257+
258+
# We expect an outbound request to /state_ids, so stub that out
259+
self.mock_federation_transport_client.get_room_state_ids.return_value = make_awaitable(
260+
{
261+
# Mimic the other server not knowing about the state at all.
262+
# We want to cause Synapse to throw an error (`Unable to get
263+
# missing prev_event $fake_prev_event`) and fail to backfill
264+
# the pulled event.
265+
"pdu_ids": [],
266+
"auth_chain_ids": [],
267+
}
268+
)
269+
# We also expect an outbound request to /state
270+
self.mock_federation_transport_client.get_room_state.return_value = make_awaitable(
271+
StateRequestResponse(
272+
# Mimic the other server not knowing about the state at all.
273+
# We want to cause Synapse to throw an error (`Unable to get
274+
# missing prev_event $fake_prev_event`) and fail to backfill
275+
# the pulled event.
276+
auth_events=[],
277+
state=[],
278+
)
279+
)
280+
281+
pulled_event = make_event_from_dict(
282+
self.add_hashes_and_signatures_from_other_server(
283+
{
284+
"type": "test_regular_type",
285+
"room_id": room_id,
286+
"sender": OTHER_USER,
287+
"prev_events": [
288+
# The fake prev event will make the pulled event fail
289+
# the history check (`Unable to get missing prev_event
290+
# $fake_prev_event`)
291+
"$fake_prev_event"
292+
],
293+
"auth_events": [],
294+
"origin_server_ts": 1,
295+
"depth": 12,
296+
"content": {"body": "pulled"},
297+
}
298+
),
299+
room_version,
300+
)
301+
302+
# The function under test: try to process the pulled event
303+
with LoggingContext("test"):
304+
self.get_success(
305+
self.hs.get_federation_event_handler()._process_pulled_event(
306+
self.OTHER_SERVER_NAME, pulled_event, backfilled=True
307+
)
308+
)
309+
310+
# Make sure our failed pull attempt was recorded
311+
backfill_num_attempts = self.get_success(
312+
main_store.db_pool.simple_select_one_onecol(
313+
table="event_failed_pull_attempts",
314+
keyvalues={"event_id": pulled_event.event_id},
315+
retcol="num_attempts",
316+
)
317+
)
318+
self.assertEqual(backfill_num_attempts, 1)
319+
320+
# The function under test: try to process the pulled event again
321+
with LoggingContext("test"):
322+
self.get_success(
323+
self.hs.get_federation_event_handler()._process_pulled_event(
324+
self.OTHER_SERVER_NAME, pulled_event, backfilled=True
325+
)
326+
)
327+
328+
# Make sure our second failed pull attempt was recorded (`num_attempts` was incremented)
329+
backfill_num_attempts = self.get_success(
330+
main_store.db_pool.simple_select_one_onecol(
331+
table="event_failed_pull_attempts",
332+
keyvalues={"event_id": pulled_event.event_id},
333+
retcol="num_attempts",
334+
)
335+
)
336+
self.assertEqual(backfill_num_attempts, 2)
337+
338+
# And as a sanity check, make sure the event was not persisted through all of this.
339+
persisted = self.get_success(
340+
main_store.get_event(pulled_event.event_id, allow_none=True)
341+
)
342+
self.assertIsNone(
343+
persisted,
344+
"pulled event that fails the history check should not be persisted at all",
345+
)
346+
347+
def test_process_pulled_event_clears_backfill_attempts_after_being_successfully_persisted(
348+
self,
349+
) -> None:
350+
"""
351+
Test to make sure that failed pull attempts
352+
(`event_failed_pull_attempts` table) for an event are cleared after the
353+
event is successfully persisted.
354+
355+
In this test, we pretend we are processing a "pulled" event via
356+
backfill. The pulled event succesfully processes and the backward
357+
extremeties are updated along with clearing out any failed pull attempts
358+
for those old extremities.
359+
360+
We check that we correctly cleared failed pull attempts of the
361+
pulled event.
362+
"""
363+
OTHER_USER = f"@user:{self.OTHER_SERVER_NAME}"
364+
main_store = self.hs.get_datastores().main
365+
366+
# Create the room
367+
user_id = self.register_user("kermit", "test")
368+
tok = self.login("kermit", "test")
369+
room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)
370+
room_version = self.get_success(main_store.get_room_version(room_id))
371+
372+
# allow the remote user to send state events
373+
self.helper.send_state(
374+
room_id,
375+
"m.room.power_levels",
376+
{"events_default": 0, "state_default": 0},
377+
tok=tok,
378+
)
379+
380+
# add the remote user to the room
381+
member_event = self.get_success(
382+
event_injection.inject_member_event(self.hs, room_id, OTHER_USER, "join")
383+
)
384+
385+
initial_state_map = self.get_success(
386+
main_store.get_partial_current_state_ids(room_id)
387+
)
388+
389+
auth_event_ids = [
390+
initial_state_map[("m.room.create", "")],
391+
initial_state_map[("m.room.power_levels", "")],
392+
member_event.event_id,
393+
]
394+
395+
pulled_event = make_event_from_dict(
396+
self.add_hashes_and_signatures_from_other_server(
397+
{
398+
"type": "test_regular_type",
399+
"room_id": room_id,
400+
"sender": OTHER_USER,
401+
"prev_events": [member_event.event_id],
402+
"auth_events": auth_event_ids,
403+
"origin_server_ts": 1,
404+
"depth": 12,
405+
"content": {"body": "pulled"},
406+
}
407+
),
408+
room_version,
409+
)
410+
411+
# Fake the "pulled" event failing to backfill once so we can test
412+
# if it's cleared out later on.
413+
self.get_success(
414+
main_store.record_event_failed_pull_attempt(
415+
pulled_event.room_id, pulled_event.event_id, "fake cause"
416+
)
417+
)
418+
# Make sure we have a failed pull attempt recorded for the pulled event
419+
backfill_num_attempts = self.get_success(
420+
main_store.db_pool.simple_select_one_onecol(
421+
table="event_failed_pull_attempts",
422+
keyvalues={"event_id": pulled_event.event_id},
423+
retcol="num_attempts",
424+
)
425+
)
426+
self.assertEqual(backfill_num_attempts, 1)
427+
428+
# The function under test: try to process the pulled event
429+
with LoggingContext("test"):
430+
self.get_success(
431+
self.hs.get_federation_event_handler()._process_pulled_event(
432+
self.OTHER_SERVER_NAME, pulled_event, backfilled=True
433+
)
434+
)
435+
436+
# Make sure the failed pull attempts for the pulled event are cleared
437+
backfill_num_attempts = self.get_success(
438+
main_store.db_pool.simple_select_one_onecol(
439+
table="event_failed_pull_attempts",
440+
keyvalues={"event_id": pulled_event.event_id},
441+
retcol="num_attempts",
442+
allow_none=True,
443+
)
444+
)
445+
self.assertIsNone(backfill_num_attempts)
446+
447+
# And as a sanity check, make sure the "pulled" event was persisted.
448+
persisted = self.get_success(
449+
main_store.get_event(pulled_event.event_id, allow_none=True)
450+
)
451+
self.assertIsNotNone(persisted, "pulled event was not persisted at all")

0 commit comments

Comments
 (0)