Resync state after partial-state join #12394
Changes from 1 commit
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Preparation for faster-room-join work: start a background process to resynchronise the room state after a room join. | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -466,6 +466,8 @@ async def do_invite_join( | |
| ) | ||
|  | ||
| if ret.partial_state: | ||
| # TODO(faster_joins): roll this back if we don't manage to start the | ||
| # background resync (eg process_remote_join fails) | ||
| await self.store.store_partial_state_room(room_id, ret.servers_in_room) | ||
|  | ||
| max_stream_id = await self._federation_event_handler.process_remote_join( | ||
|  | @@ -478,6 +480,18 @@ async def do_invite_join( | |
| partial_state=ret.partial_state, | ||
| ) | ||
|  | ||
| if ret.partial_state: | ||
| # kick off the process of asynchronously fixing up the state for this | ||
| # room | ||
| # | ||
| # TODO(faster_joins): pick this up again on restart | ||
> I'm guessing we need to persist more info to the database and then ensure that starts up again on restart?

> I'm not even sure we need any extra info in the database - all we need is a list of room ids, and we have that. So yes, it's just a matter of starting it up again on restart.

> Fair! Was mostly just curious, not asking for it to be fixed yet! 👍
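For illustration, a minimal sketch of how that restart pick-up might look. The store accessor name here is an assumption (something that lists the rooms still recorded in `partial_state_rooms`, with the servers captured at join time); none of this is part of this PR:

```python
from synapse.metrics.background_process_metrics import run_as_background_process


async def _resume_partial_state_room_syncs(self) -> None:
    """Hypothetical startup hook: restart the resync for any room that was
    still flagged as partial-state when the process last shut down.
    """
    # Assumed helper: returns {room_id: [servers_in_room, ...]} for rooms
    # still present in the partial_state_rooms table.
    rooms = await self.store.get_partial_state_rooms_and_servers()
    for room_id, servers_in_room in rooms.items():
        run_as_background_process(
            desc="sync_partial_state_room",
            func=self._sync_partial_state_room,
            # any server that was in the room at join time will do
            destination=servers_in_room[0],
            room_id=room_id,
        )
```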
| run_as_background_process( | ||
| desc="sync_partial_state_room", | ||
| func=self._sync_partial_state_room, | ||
| destination=origin, | ||
| room_id=room_id, | ||
| ) | ||
|  | ||
| # We wait here until this instance has seen the events come down | ||
| # replication (if we're using replication) as the below uses caches. | ||
| await self._replication.wait_for_stream_position( | ||
|  | @@ -1370,3 +1384,61 @@ async def get_room_complexity( | |
| # We fell off the bottom, couldn't get the complexity from anyone. Oh | ||
| # well. | ||
| return None | ||
|  | ||
| async def _sync_partial_state_room( | ||
| self, | ||
| destination: str, | ||
| room_id: str, | ||
| ) -> None: | ||
| """Background process to resync the state of a partial-state room | ||
|  | ||
| Args: | ||
| destination: homeserver to pull the state from | ||
| room_id: room to be resynced | ||
| """ | ||
|  | ||
| # TODO(faster_joins): do we need to lock to avoid races? What happens if other | ||
| # worker processes kick off a resync in parallel? Perhaps we should just elect | ||
| # a single worker to do the resync. | ||
| 
Comment on lines +1400 to +1402:

> Maybe put this on the background process worker?

> yeah, maybe. On the other hand it does a bunch of work which is quite similar to the event creator, and it would be nice to shard it across multiple workers (as we do with event creators), so it's not entirely obvious. Either way, I'm punting the decision for now.
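For context, the "run it on the background-tasks worker" option could be as simple as gating the kick-off on the existing worker configuration. This is purely illustrative of one of the options discussed, not what the PR does (the decision is deliberately deferred):

```python
# Illustrative only: restrict the resync kick-off to the worker configured to
# run background tasks, rather than whichever worker handled the join.
# (Assumes the handler has access to the homeserver config as `self.config`.)
if self.config.worker.run_background_tasks:
    run_as_background_process(
        desc="sync_partial_state_room",
        func=self._sync_partial_state_room,
        destination=origin,
        room_id=room_id,
    )
```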
| # | ||
| # TODO(faster_joins): what happens if we leave the room during a resync? if we | ||
| # really leave, that might mean we have difficulty getting the room state over | ||
| # federation. | ||
| 
Comment on lines +1404 to +1406:

> Maybe this is fine, but will this work OK if a second user joins from your server while we only have partial room state?

> that's really a special-case of the more general problem of sending events while we have partial state. Currently, it's completely broken, but in a couple of PRs' time, we'll have a thing which will wait for the resync to complete.

> That makes sense, thanks! 👍 Glad to know this is something in the works already.
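A rough sketch of what that "wait for the resync to complete" mechanism might look like. The class and method names are assumptions about future work, not anything in this PR:

```python
from typing import Dict, List, Set

from twisted.internet import defer


class PartialStateRoomTracker:
    """Hypothetical helper letting event creation wait until a room's
    partial-state resync has finished."""

    def __init__(self) -> None:
        self._partial_state_rooms: Set[str] = set()
        # room_id -> Deferreds to fire once the resync completes
        self._waiters: Dict[str, List[defer.Deferred]] = {}

    def notify_partial_state_room(self, room_id: str) -> None:
        """Record that we have joined a room with only partial state."""
        self._partial_state_rooms.add(room_id)

    async def await_full_state(self, room_id: str) -> None:
        """Wait until the room's state resync has completed, if one is running."""
        if room_id not in self._partial_state_rooms:
            return
        d: defer.Deferred = defer.Deferred()
        self._waiters.setdefault(room_id, []).append(d)
        await d

    def notify_resync_complete(self, room_id: str) -> None:
        """To be called once clear_partial_state_room has succeeded."""
        self._partial_state_rooms.discard(room_id)
        for d in self._waiters.pop(room_id, []):
            d.callback(None)
```

Event creation could then call `await_full_state(room_id)` before building events in a room that is still being resynced.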
| # | ||
| # TODO(faster_joins): try other destinations if the one we have fails | ||
|  | ||
| logger.info("Syncing state for room %s via %s", room_id, destination) | ||
|  | ||
| # we work through the queue in order of increasing stream ordering. | ||
| while True: | ||
| batch = await self.store.get_partial_state_events_batch(room_id) | ||
| if not batch: | ||
| # all the events are updated, so we can update current state and | ||
| # clear the lazy-loading flag. | ||
| logger.info("Updating current state for %s", room_id) | ||
| assert ( | ||
| self.storage.persistence is not None | ||
| ), "TODO(faster_joins): support for workers" | ||
| await self.storage.persistence.update_current_state(room_id) | ||
|  | ||
| logger.info("Clearing partial-state flag for %s", room_id) | ||
| success = await self.store.clear_partial_state_room(room_id) | ||
| if success: | ||
| logger.info("State resync complete for %s", room_id) | ||
|  | ||
| # TODO(faster_joins) update room stats and user directory? | ||
| return | ||
|  | ||
| # we raced against more events arriving with partial state. Go round | ||
| # the loop again. We've already logged a warning, so no need for more. | ||
| 
Comment on lines +1432 to +1433:

> I'm a bit confused at where this might come from -- if we've resolved the entire room state how could there be states with partial state arriving? These are from other servers in the room sending us events, not from the process of resolving partial state, correct?

> that's right. Currently, when we get a new event over federation, we calculate the state at that event based on the state at the new event's `prev_events`; if any of those only have partial state, the state we calculate for the new event is itself partial, so the new event lands in the queue too. Obviously, there's another failure mode here, where …
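To illustrate the mechanism described in that reply, a simplified sketch (not the actual Synapse code; the helper name is an assumption) of why an event arriving over federation ends up flagged as partial-state, and hence back in the batch processed by the loop above:

```python
async def _has_partial_state_prev_events(store, event) -> bool:
    """Sketch: the state at a newly received federation event is derived from
    the state at its prev_events, so if any prev_event still has partial
    state, the new event's state is itself partial and the event is recorded
    in partial_state_events when persisted."""
    for prev_event_id in event.prev_event_ids():
        # assumed helper: is this event still in the partial_state_events table?
        if await store.is_partial_state_event(prev_event_id):
            return True
    return False
```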
| continue | ||
|  | ||
| events = await self.store.get_events_as_list( | ||
| batch, | ||
| redact_behaviour=EventRedactBehaviour.AS_IS, | ||
| allow_rejected=True, | ||
| ) | ||
| for event in events: | ||
| await self._federation_event_handler.update_state_for_partial_state_event( | ||
| destination, event | ||
| ) | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -21,6 +21,7 @@ | |
| from synapse.api.errors import NotFoundError, UnsupportedRoomVersionError | ||
| from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion | ||
| from synapse.events import EventBase | ||
| from synapse.events.snapshot import EventContext | ||
| from synapse.storage._base import SQLBaseStore | ||
| from synapse.storage.database import ( | ||
| DatabasePool, | ||
|  | @@ -354,6 +355,53 @@ async def get_referenced_state_groups( | |
|  | ||
| return {row["state_group"] for row in rows} | ||
|  | ||
| async def update_state_for_partial_state_event( | ||
| self, | ||
| event: EventBase, | ||
| context: EventContext, | ||
| ) -> None: | ||
| """Update the state group for a partial state event""" | ||
| await self.db_pool.runInteraction( | ||
| "update_state_for_partial_state_event", | ||
| self._update_state_for_partial_state_event_txn, | ||
| event, | ||
| context, | ||
| ) | ||
|  | ||
| def _update_state_for_partial_state_event_txn( | ||
| self, | ||
| txn, | ||
| event: EventBase, | ||
| context: EventContext, | ||
| ): | ||
| # we shouldn't have any outliers here | ||
| assert not event.internal_metadata.is_outlier() | ||
|  | ||
| # anything that was rejected should have the same state as its | ||
| # predecessor. | ||
| if context.rejected: | ||
| assert context.state_group == context.state_group_before_event | ||
|  | ||
| self.db_pool.simple_update_txn( | ||
| txn, | ||
| table="event_to_state_groups", | ||
| keyvalues={"event_id": event.event_id}, | ||
| updatevalues={"state_group": context.state_group}, | ||
| ) | ||
|  | ||
| self.db_pool.simple_delete_one_txn( | ||
| txn, | ||
| table="partial_state_events", | ||
| keyvalues={"event_id": event.event_id}, | ||
| ) | ||
| 
Comment on lines +385 to +396:

> It seems unfortunate that these will be called separately per event, but I don't see a good way to make this method take a list of events.

> I think we can think about optimising it later. Hopefully there won't be a huge number of events anyway.
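If batching did turn out to be worthwhile, one possible shape would be to accept a list of event/context pairs and use the existing many-row helpers. A sketch only, not part of this PR:

```python
def _update_state_for_partial_state_events_batch_txn(
    self,
    txn,
    events_and_contexts,  # List[Tuple[EventBase, EventContext]]
) -> None:
    """Hypothetical batched variant of _update_state_for_partial_state_event_txn."""
    self.db_pool.simple_update_many_txn(
        txn,
        table="event_to_state_groups",
        key_names=("event_id",),
        key_values=[(event.event_id,) for event, _ in events_and_contexts],
        value_names=("state_group",),
        value_values=[
            (context.state_group,) for _, context in events_and_contexts
        ],
    )

    # positional args, since the exact keyword names of this helper may vary
    self.db_pool.simple_delete_many_txn(
        txn,
        "partial_state_events",
        "event_id",
        [event.event_id for event, _ in events_and_contexts],
        {},
    )

    for event, context in events_and_contexts:
        txn.call_after(
            self._get_state_group_for_event.prefill,
            (event.event_id,),
            context.state_group,
        )
```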
|  | ||
| # TODO: need to do something about workers here | ||
| txn.call_after( | ||
| self._get_state_group_for_event.prefill, | ||
| (event.event_id,), | ||
| context.state_group, | ||
| ) | ||
|  | ||
|  | ||
| class MainStateBackgroundUpdateStore(RoomMemberWorkerStore): | ||
|  | ||
|  | ||