Skip to content

Commit 01d45fe

Browse files
authored
Prune inbound federation queues if they get too long (matrix-org#10390)
1 parent ba5287f commit 01d45fe

File tree

4 files changed

+177
-2
lines changed

4 files changed

+177
-2
lines changed

changelog.d/10390.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Prune inbound federation queues for a room if they get too large.

synapse/federation/federation_server.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,6 +1024,23 @@ async def _process_incoming_pdus_in_room_inner(
10241024

10251025
origin, event = next
10261026

1027+
# Prune the event queue if it's getting large.
1028+
#
1029+
# We do this *after* handling the first event as the common case is
1030+
# that the queue is empty (/has the single event in), and so there's
1031+
# no need to do this check.
1032+
pruned = await self.store.prune_staged_events_in_room(room_id, room_version)
1033+
if pruned:
1034+
# If we have pruned the queue, we need to refetch the next
1035+
# event to handle.
1036+
next = await self.store.get_next_staged_event_for_room(
1037+
room_id, room_version
1038+
)
1039+
if not next:
1040+
break
1041+
1042+
origin, event = next
1043+
10271044
lock = await self.store.try_acquire_lock(
10281045
_INBOUND_EVENT_HANDLING_LOCK_NAME, room_id
10291046
)

synapse/storage/databases/main/event_federation.py

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
from queue import Empty, PriorityQueue
1717
from typing import Collection, Dict, Iterable, List, Optional, Set, Tuple
1818

19-
from prometheus_client import Gauge
19+
from prometheus_client import Counter, Gauge
2020

2121
from synapse.api.constants import MAX_DEPTH
2222
from synapse.api.errors import StoreError
23-
from synapse.api.room_versions import RoomVersion
23+
from synapse.api.room_versions import EventFormatVersions, RoomVersion
2424
from synapse.events import EventBase, make_event_from_dict
2525
from synapse.metrics.background_process_metrics import wrap_as_background_process
2626
from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
@@ -44,6 +44,12 @@
4444
"The total number of events in the inbound federation staging",
4545
)
4646

47+
# Tracks how many staged inbound federation events have been discarded by
# queue pruning.
pdus_pruned_from_federation_queue = Counter(
    name="synapse_federation_server_number_inbound_pdu_pruned",
    documentation=(
        "The number of events in the inbound federation staging that have been "
        "pruned due to the queue getting too long"
    ),
)
52+
4753
logger = logging.getLogger(__name__)
4854

4955

@@ -1277,6 +1283,100 @@ def _get_next_staged_event_for_room_txn(txn):
12771283

12781284
return origin, event
12791285

1286+
async def prune_staged_events_in_room(
    self,
    room_id: str,
    room_version: RoomVersion,
) -> bool:
    """Checks if there are lots of staged events for the room, and if so
    prune them down.

    Only staged events that are referenced in the `prev_events` of another
    staged event for the same room are deleted; staged events that nothing
    else in the queue points at are kept.

    Args:
        room_id: The room whose staging queue should be checked.
        room_version: The version of the room, used to decide how the
            `prev_events` field of each staged event is laid out.

    Returns:
        Whether any events were pruned
    """

    # First check the size of the queue.
    count = await self.db_pool.simple_select_one_onecol(
        table="federation_inbound_events_staging",
        keyvalues={"room_id": room_id},
        retcol="COALESCE(COUNT(*), 0)",
        desc="prune_staged_events_in_room_count",
    )

    if count < 100:
        return False

    # If the queue is too large, then we want to clear the entire queue,
    # keeping only the forward extremities (i.e. the events not referenced
    # by other events in the queue). We do this so that we can always
    # backpaginate in all the events we have dropped.
    rows = await self.db_pool.simple_select_list(
        table="federation_inbound_events_staging",
        keyvalues={"room_id": room_id},
        retcols=("event_id", "event_json"),
        desc="prune_staged_events_in_room_fetch",
    )

    # Find the set of events referenced by those in the queue, as well as
    # collecting all the event IDs in the queue.
    referenced_events: Set[str] = set()
    seen_events: Set[str] = set()
    for row in rows:
        event_id = row["event_id"]
        seen_events.add(event_id)
        event_d = db_to_json(row["event_json"])

        # We don't bother parsing the dicts into full blown event objects,
        # as that is needlessly expensive.

        # We haven't checked that the `prev_events` have the right format
        # yet, so we check as we go.
        prev_events = event_d.get("prev_events", [])
        if not isinstance(prev_events, list):
            logger.info("Invalid prev_events for %s", event_id)
            continue

        if room_version.event_format == EventFormatVersions.V1:
            for prev_event_tuple in prev_events:
                # Each entry must itself be a two-element list whose first
                # item is the event ID. The length check applies to the
                # individual entry, not to the enclosing `prev_events`
                # list: an event may have any number of prev events.
                if (
                    not isinstance(prev_event_tuple, list)
                    or len(prev_event_tuple) != 2
                ):
                    logger.info("Invalid prev_events for %s", event_id)
                    break

                prev_event_id = prev_event_tuple[0]
                if not isinstance(prev_event_id, str):
                    logger.info("Invalid prev_events for %s", event_id)
                    break

                referenced_events.add(prev_event_id)
        else:
            for prev_event_id in prev_events:
                if not isinstance(prev_event_id, str):
                    logger.info("Invalid prev_events for %s", event_id)
                    break

                referenced_events.add(prev_event_id)

    # Only events that are both staged and pointed at by another staged
    # event are removed.
    to_delete = referenced_events & seen_events
    if not to_delete:
        return False

    pdus_pruned_from_federation_queue.inc(len(to_delete))
    logger.info(
        "Pruning %d events in room %s from federation queue",
        len(to_delete),
        room_id,
    )

    await self.db_pool.simple_delete_many(
        table="federation_inbound_events_staging",
        keyvalues={"room_id": room_id},
        iterable=to_delete,
        column="event_id",
        desc="prune_staged_events_in_room_delete",
    )

    return True
1379+
12801380
async def get_all_rooms_with_staged_incoming_events(self) -> List[str]:
12811381
"""Get the room IDs of all events currently staged."""
12821382
return await self.db_pool.simple_select_onecol(

tests/storage/test_event_federation.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
import attr
1616
from parameterized import parameterized
1717

18+
from synapse.api.room_versions import RoomVersions
1819
from synapse.events import _EventInternalMetadata
20+
from synapse.util import json_encoder
1921

2022
import tests.unittest
2123
import tests.utils
@@ -504,6 +506,61 @@ def insert_event(txn):
504506
)
505507
self.assertSetEqual(difference, set())
506508

509+
def test_prune_inbound_federation_queue(self):
    """Test that pruning of an oversized inbound federation queue works."""

    room_id = "some_room_id"

    # Stage a chain of 500 events in which every event lists the one before
    # it as its only prev_event.
    staged_rows = []
    for i in range(500):
        staged_rows.append(
            {
                "origin": "some_origin",
                "room_id": room_id,
                "received_ts": 0,
                "event_id": f"$fake_event_id_{i + 1}",
                "event_json": json_encoder.encode(
                    {"prev_events": [f"$fake_event_id_{i}"]}
                ),
                "internal_metadata": "{}",
            }
        )

    self.get_success(
        self.store.db_pool.simple_insert_many(
            table="federation_inbound_events_staging",
            values=staged_rows,
            desc="test_prune_inbound_federation_queue",
        )
    )

    # The first prune call should report that events were dropped ...
    first_prune = self.get_success(
        self.store.prune_staged_events_in_room(room_id, RoomVersions.V6)
    )
    self.assertTrue(first_prune)

    # ... while an immediate second call should report that nothing was
    # pruned.
    second_prune = self.get_success(
        self.store.prune_staged_events_in_room(room_id, RoomVersions.V6)
    )
    self.assertFalse(second_prune)

    # Exactly one event should remain staged for the room.
    remaining = self.get_success(
        self.store.db_pool.simple_select_one_onecol(
            table="federation_inbound_events_staging",
            keyvalues={"room_id": room_id},
            retcol="COALESCE(COUNT(*), 0)",
            desc="test_prune_inbound_federation_queue",
        )
    )
    self.assertEqual(remaining, 1)

    # The survivor must be the last event of the chain, which no other
    # staged event references.
    _, surviving_event_id = self.get_success(
        self.store.get_next_staged_event_id_for_room(room_id)
    )
    self.assertEqual(surviving_event_id, "$fake_event_id_500")
507564

508565
@attr.s
509566
class FakeEvent:

0 commit comments

Comments
 (0)