
Commit 82c1ee1

Add experimental support for sharding event persister. (#8170)

This is *not* ready for production yet. Caveats:

1. We should write some tests...
2. The stream token that we use for events can get stalled at the minimum
   position of all writers. This means that new events may not be processed
   and e.g. sent down sync streams if a writer isn't writing or is slow.

1 parent b257c78 · commit 82c1ee1
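
Caveat 2 can be pictured with a toy sketch (not code from this commit; writer names and positions are invented): with multiple event persisters, the stream position that is safe to advertise to readers is bounded by the slowest writer, so an idle or lagging writer stalls the token.

# Toy illustration of caveat 2 (hypothetical, not Synapse code): with
# multiple event persisters, the token readers can safely use is bounded
# by the slowest writer's position.
writer_positions = {
    "event_persister1": 105,  # actively writing
    "event_persister2": 42,   # idle or slow
}

# Events up to this position are safe to send down sync streams; anything
# newer might interleave with rows a lagging writer has yet to commit.
current_token = min(writer_positions.values())
print(current_token)  # 42 -- stalled at the slow writer's position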

File tree

18 files changed: +206 −77 lines

changelog.d/8170.feature

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Add experimental support for sharding event persister.

synapse/config/_base.py

Lines changed: 18 additions & 3 deletions
@@ -832,11 +832,26 @@ class ShardedWorkerHandlingConfig:
     def should_handle(self, instance_name: str, key: str) -> bool:
         """Whether this instance is responsible for handling the given key.
         """
-
-        # If multiple instances are not defined we always return true.
+        # If multiple instances are not defined we always return true
         if not self.instances or len(self.instances) == 1:
             return True
 
+        return self.get_instance(key) == instance_name
+
+    def get_instance(self, key: str) -> str:
+        """Get the instance responsible for handling the given key.
+
+        Note: For things like federation sending the config for which instance
+        is sending is known only to the sender instance if there is only one.
+        Therefore `should_handle` should be used where possible.
+        """
+
+        if not self.instances:
+            return "master"
+
+        if len(self.instances) == 1:
+            return self.instances[0]
+
         # We shard by taking the hash, modulo it by the number of instances and
         # then checking whether this instance matches the instance at that
         # index.

@@ -846,7 +861,7 @@ def should_handle(self, instance_name: str, key: str) -> bool:
         dest_hash = sha256(key.encode("utf8")).digest()
         dest_int = int.from_bytes(dest_hash, byteorder="little")
         remainder = dest_int % (len(self.instances))
-        return self.instances[remainder] == instance_name
+        return self.instances[remainder]
 
 
 __all__ = ["Config", "RootConfig", "ShardedWorkerHandlingConfig"]
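
For illustration (not part of the commit; the instance names and room ID are invented), here is the routing above in isolation: the key is SHA-256 hashed and reduced modulo the number of instances, so every worker independently computes the same writer for a given key.

from hashlib import sha256

def get_instance(instances, key):
    # Mirrors ShardedWorkerHandlingConfig.get_instance from the diff above.
    if not instances:
        return "master"
    if len(instances) == 1:
        return instances[0]
    dest_hash = sha256(key.encode("utf8")).digest()
    dest_int = int.from_bytes(dest_hash, byteorder="little")
    return instances[dest_int % len(instances)]

writers = ["event_persister1", "event_persister2"]
room_id = "!abcdefg:example.com"
# Every instance computes the same answer for the same key, so
# `should_handle` reduces to comparing this result with our own name.
print(get_instance(writers, room_id))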

synapse/config/_base.pyi

Lines changed: 1 addition & 0 deletions
@@ -142,3 +142,4 @@ class ShardedWorkerHandlingConfig:
     instances: List[str]
     def __init__(self, instances: List[str]) -> None: ...
     def should_handle(self, instance_name: str, key: str) -> bool: ...
+    def get_instance(self, key: str) -> str: ...

synapse/config/workers.py

Lines changed: 27 additions & 10 deletions
@@ -13,12 +13,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import List, Union
+
 import attr
 
 from ._base import Config, ConfigError, ShardedWorkerHandlingConfig
 from .server import ListenerConfig, parse_listener_def
 
 
+def _instance_to_list_converter(obj: Union[str, List[str]]) -> List[str]:
+    """Helper for allowing parsing a string or list of strings to a config
+    option expecting a list of strings.
+    """
+
+    if isinstance(obj, str):
+        return [obj]
+    return obj
+
+
 @attr.s
 class InstanceLocationConfig:
     """The host and port to talk to an instance via HTTP replication.

@@ -33,11 +45,13 @@ class WriterLocations:
     """Specifies the instances that write various streams.
 
     Attributes:
-        events: The instance that writes to the event and backfill streams.
-        events: The instance that writes to the typing stream.
+        events: The instances that write to the event and backfill streams.
+        typing: The instance that writes to the typing stream.
     """
 
-    events = attr.ib(default="master", type=str)
+    events = attr.ib(
+        default=["master"], type=List[str], converter=_instance_to_list_converter
+    )
     typing = attr.ib(default="master", type=str)
 
 

@@ -105,15 +119,18 @@ def read_config(self, config, **kwargs):
         writers = config.get("stream_writers") or {}
         self.writers = WriterLocations(**writers)
 
-        # Check that the configured writer for events and typing also appears in
+        # Check that the configured writers for events and typing also appears in
         # `instance_map`.
         for stream in ("events", "typing"):
-            instance = getattr(self.writers, stream)
-            if instance != "master" and instance not in self.instance_map:
-                raise ConfigError(
-                    "Instance %r is configured to write %s but does not appear in `instance_map` config."
-                    % (instance, stream)
-                )
+            instances = _instance_to_list_converter(getattr(self.writers, stream))
+            for instance in instances:
+                if instance != "master" and instance not in self.instance_map:
+                    raise ConfigError(
+                        "Instance %r is configured to write %s but does not appear in `instance_map` config."
+                        % (instance, stream)
+                    )
+
+        self.events_shard_config = ShardedWorkerHandlingConfig(self.writers.events)
 
     def generate_config_section(self, config_dir_path, server_name, **kwargs):
         return """\

synapse/handlers/federation.py

Lines changed: 30 additions & 14 deletions
@@ -923,7 +923,8 @@ async def backfill(self, dest, room_id, limit, extremities):
                 )
             )
 
-        await self._handle_new_events(dest, ev_infos, backfilled=True)
+        if ev_infos:
+            await self._handle_new_events(dest, room_id, ev_infos, backfilled=True)
 
         # Step 2: Persist the rest of the events in the chunk one by one
         events.sort(key=lambda e: e.depth)

@@ -1216,7 +1217,7 @@ async def get_event(event_id: str):
             event_infos.append(_NewEventInfo(event, None, auth))
 
         await self._handle_new_events(
-            destination, event_infos,
+            destination, room_id, event_infos,
         )
 
     def _sanity_check_event(self, ev):

@@ -1363,15 +1364,15 @@ async def do_invite_join(
         )
 
         max_stream_id = await self._persist_auth_tree(
-            origin, auth_chain, state, event, room_version_obj
+            origin, room_id, auth_chain, state, event, room_version_obj
         )
 
         # We wait here until this instance has seen the events come down
         # replication (if we're using replication) as the below uses caches.
-        #
-        # TODO: Currently the events stream is written to from master
         await self._replication.wait_for_stream_position(
-            self.config.worker.writers.events, "events", max_stream_id
+            self.config.worker.events_shard_config.get_instance(room_id),
+            "events",
+            max_stream_id,
         )
 
         # Check whether this room is the result of an upgrade of a room we already know

@@ -1625,7 +1626,7 @@ async def on_invite_request(
         )
 
         context = await self.state_handler.compute_event_context(event)
-        await self.persist_events_and_notify([(event, context)])
+        await self.persist_events_and_notify(event.room_id, [(event, context)])
 
         return event
 

@@ -1652,7 +1653,9 @@ async def do_remotely_reject_invite(
         await self.federation_client.send_leave(host_list, event)
 
         context = await self.state_handler.compute_event_context(event)
-        stream_id = await self.persist_events_and_notify([(event, context)])
+        stream_id = await self.persist_events_and_notify(
+            event.room_id, [(event, context)]
+        )
 
         return event, stream_id
 

@@ -1900,7 +1903,7 @@ async def _handle_new_event(
             )
 
             await self.persist_events_and_notify(
-                [(event, context)], backfilled=backfilled
+                event.room_id, [(event, context)], backfilled=backfilled
             )
         except Exception:
             run_in_background(

@@ -1913,6 +1916,7 @@ async def _handle_new_event(
     async def _handle_new_events(
         self,
         origin: str,
+        room_id: str,
         event_infos: Iterable[_NewEventInfo],
         backfilled: bool = False,
     ) -> None:

@@ -1944,6 +1948,7 @@ async def prep(ev_info: _NewEventInfo):
         )
 
         await self.persist_events_and_notify(
+            room_id,
             [
                 (ev_info.event, context)
                 for ev_info, context in zip(event_infos, contexts)

@@ -1954,6 +1959,7 @@ async def prep(ev_info: _NewEventInfo):
     async def _persist_auth_tree(
         self,
         origin: str,
+        room_id: str,
         auth_events: List[EventBase],
         state: List[EventBase],
         event: EventBase,

@@ -1968,6 +1974,7 @@ async def _persist_auth_tree(
 
         Args:
             origin: Where the events came from
+            room_id,
             auth_events
             state
             event

@@ -2042,17 +2049,20 @@ async def _persist_auth_tree(
                 events_to_context[e.event_id].rejected = RejectedReason.AUTH_ERROR
 
         await self.persist_events_and_notify(
+            room_id,
             [
                 (e, events_to_context[e.event_id])
                 for e in itertools.chain(auth_events, state)
-            ]
+            ],
         )
 
         new_event_context = await self.state_handler.compute_event_context(
             event, old_state=state
         )
 
-        return await self.persist_events_and_notify([(event, new_event_context)])
+        return await self.persist_events_and_notify(
+            room_id, [(event, new_event_context)]
+        )
 
     async def _prep_event(
         self,

@@ -2903,21 +2913,27 @@ async def _check_key_revocation(self, public_key, url):
 
     async def persist_events_and_notify(
         self,
+        room_id: str,
         event_and_contexts: Sequence[Tuple[EventBase, EventContext]],
         backfilled: bool = False,
     ) -> int:
         """Persists events and tells the notifier/pushers about them, if
         necessary.
 
         Args:
-            event_and_contexts:
+            room_id: The room ID of events being persisted.
+            event_and_contexts: Sequence of events with their associated
+                context that should be persisted. All events must belong to
+                the same room.
             backfilled: Whether these events are a result of
                 backfilling or not
         """
-        if self.config.worker.writers.events != self._instance_name:
+        instance = self.config.worker.events_shard_config.get_instance(room_id)
+        if instance != self._instance_name:
             result = await self._send_events(
-                instance_name=self.config.worker.writers.events,
+                instance_name=instance,
                 store=self.store,
+                room_id=room_id,
                 event_and_contexts=event_and_contexts,
                 backfilled=backfilled,
             )
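
The recurring pattern across these handler changes: resolve the writer for the event's room, persist locally if that is this instance, otherwise forward over replication. A self-contained sketch with stubbed helpers (all names invented; the real logic lives in `persist_events_and_notify` above):

import asyncio
from hashlib import sha256

WRITERS = ["event_persister1", "event_persister2"]  # assumed stream_writers.events
INSTANCE_NAME = "event_persister1"                  # this worker's name

def get_instance(room_id: str) -> str:
    # Same hash-mod routing as ShardedWorkerHandlingConfig.get_instance.
    dest = int.from_bytes(sha256(room_id.encode("utf8")).digest(), "little")
    return WRITERS[dest % len(WRITERS)]

async def persist_locally(event_and_contexts):
    return 1  # stand-in for the storage layer; returns a max stream ID

async def send_to_writer(instance, room_id, event_and_contexts):
    return 1  # stand-in for the replication HTTP call to `instance`

async def persist_events_and_notify(room_id, event_and_contexts):
    instance = get_instance(room_id)
    if instance != INSTANCE_NAME:
        # Not our shard: forward to the event persister that owns this room.
        return await send_to_writer(instance, room_id, event_and_contexts)
    return await persist_locally(event_and_contexts)

print(asyncio.run(persist_events_and_notify("!room:example.com", [])))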

synapse/handlers/message.py

Lines changed: 8 additions & 6 deletions
@@ -376,9 +376,8 @@ def __init__(self, hs: "HomeServer"):
         self.notifier = hs.get_notifier()
         self.config = hs.config
         self.require_membership_for_aliases = hs.config.require_membership_for_aliases
-        self._is_event_writer = (
-            self.config.worker.writers.events == hs.get_instance_name()
-        )
+        self._events_shard_config = self.config.worker.events_shard_config
+        self._instance_name = hs.get_instance_name()
 
         self.room_invite_state_types = self.hs.config.room_invite_state_types
 

@@ -904,9 +903,10 @@ async def handle_new_client_event(
 
         try:
             # If we're a worker we need to hit out to the master.
-            if not self._is_event_writer:
+            writer_instance = self._events_shard_config.get_instance(event.room_id)
+            if writer_instance != self._instance_name:
                 result = await self.send_event(
-                    instance_name=self.config.worker.writers.events,
+                    instance_name=writer_instance,
                     event_id=event.event_id,
                     store=self.store,
                     requester=requester,

@@ -974,7 +974,9 @@ async def persist_and_notify_client_event(
 
         This should only be run on the instance in charge of persisting events.
         """
-        assert self._is_event_writer
+        assert self._events_shard_config.should_handle(
+            self._instance_name, event.room_id
+        )
 
         if ratelimit:
             # We check if this is a room admin redacting an event so that we

synapse/handlers/room.py

Lines changed: 9 additions & 5 deletions
@@ -804,7 +804,9 @@ async def create_room(
 
         # Always wait for room creation to progate before returning
         await self._replication.wait_for_stream_position(
-            self.hs.config.worker.writers.events, "events", last_stream_id
+            self.hs.config.worker.events_shard_config.get_instance(room_id),
+            "events",
+            last_stream_id,
         )
 
         return result, last_stream_id

@@ -1260,10 +1262,10 @@ async def shutdown_room(
             # We now wait for the create room to come back in via replication so
             # that we can assume that all the joins/invites have propogated before
             # we try and auto join below.
-            #
-            # TODO: Currently the events stream is written to from master
             await self._replication.wait_for_stream_position(
-                self.hs.config.worker.writers.events, "events", stream_id
+                self.hs.config.worker.events_shard_config.get_instance(new_room_id),
+                "events",
+                stream_id,
             )
         else:
             new_room_id = None

@@ -1293,7 +1295,9 @@ async def shutdown_room(
 
         # Wait for leave to come in over replication before trying to forget.
         await self._replication.wait_for_stream_position(
-            self.hs.config.worker.writers.events, "events", stream_id
+            self.hs.config.worker.events_shard_config.get_instance(room_id),
+            "events",
+            stream_id,
         )
 
         await self.room_member_handler.forget(target_requester.user, room_id)

synapse/handlers/room_member.py

Lines changed: 0 additions & 7 deletions
@@ -82,13 +82,6 @@ def __init__(self, hs: "HomeServer"):
         self._enable_lookup = hs.config.enable_3pid_lookup
         self.allow_per_room_profiles = self.config.allow_per_room_profiles
 
-        self._event_stream_writer_instance = hs.config.worker.writers.events
-        self._is_on_event_persistence_instance = (
-            self._event_stream_writer_instance == hs.get_instance_name()
-        )
-        if self._is_on_event_persistence_instance:
-            self.persist_event_storage = hs.get_storage().persistence
-
         self._join_rate_limiter_local = Ratelimiter(
             clock=self.clock,
             rate_hz=hs.config.ratelimiting.rc_joins_local.per_second,

synapse/replication/http/federation.py

Lines changed: 9 additions & 3 deletions
@@ -65,10 +65,11 @@ def __init__(self, hs):
         self.federation_handler = hs.get_handlers().federation_handler
 
     @staticmethod
-    async def _serialize_payload(store, event_and_contexts, backfilled):
+    async def _serialize_payload(store, room_id, event_and_contexts, backfilled):
         """
         Args:
             store
+            room_id (str)
             event_and_contexts (list[tuple[FrozenEvent, EventContext]])
             backfilled (bool): Whether or not the events are the result of
                 backfilling

@@ -88,14 +89,19 @@ async def _serialize_payload(store, event_and_contexts, backfilled):
             }
         )
 
-        payload = {"events": event_payloads, "backfilled": backfilled}
+        payload = {
+            "events": event_payloads,
+            "backfilled": backfilled,
+            "room_id": room_id,
+        }
 
         return payload
 
     async def _handle_request(self, request):
         with Measure(self.clock, "repl_fed_send_events_parse"):
             content = parse_json_object_from_request(request)
 
+            room_id = content["room_id"]
             backfilled = content["backfilled"]
 
             event_payloads = content["events"]

@@ -120,7 +126,7 @@ async def _handle_request(self, request):
         logger.info("Got %d events from federation", len(event_and_contexts))
 
         max_stream_id = await self.federation_handler.persist_events_and_notify(
-            event_and_contexts, backfilled
+            room_id, event_and_contexts, backfilled
        )
 
        return 200, {"max_stream_id": max_stream_id}
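
For illustration, a hypothetical request body for this endpoint after the change (values invented; the per-event fields built by `_serialize_payload` are elided):

# Hypothetical wire payload for the send-events replication endpoint.
# The new piece is the top-level "room_id".
payload = {
    "events": [],  # serialized (event, context) pairs would go here
    "backfilled": False,
    "room_id": "!abcdefg:example.com",  # invented room ID
}

# _handle_request can now recover the room before persisting:
room_id = payload["room_id"]
backfilled = payload["backfilled"]
print(room_id, backfilled)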
