Fix sharded federation sender sometimes using 100% CPU.

erikjohnston · erikjohnston · commit 3a569fb2000e · 2021-04-08T17:34:07.000+01:00
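We pull all destinations requiring catch-up from the DB in batches.
However, if every destination in a batch gets filtered out (because the
federation sender is sharded and none of them belong to this instance),
then the `last_processed` destination doesn't get updated, and we keep
requesting the same batch repeatedly. Fix this by setting
`last_processed` to the final destination of each batch before the
shard filter is applied, so it always advances.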
diff --git a/changelog.d/9770.bugfix b/changelog.d/9770.bugfix
@@ -0,0 +1 @@
+Fix bug where sharded federation senders could get stuck repeatedly querying the DB in a loop, using lots of CPU.
diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py
@@ -734,16 +734,18 @@ async def _wake_destinations_needing_catchup(self) -> None:
                 self._catchup_after_startup_timer = None
                 break
 
+            last_processed = destinations_to_wake[-1]
+
             destinations_to_wake = [
                 d
                 for d in destinations_to_wake
                 if self._federation_shard_config.should_handle(self._instance_name, d)
             ]
 
-            for last_processed in destinations_to_wake:
+            for destination in destinations_to_wake:
                 logger.info(
                     "Destination %s has outstanding catch-up, waking up.",
                     last_processed,
                 )
-                self.wake_destination(last_processed)
+                self.wake_destination(destination)
                 await self.clock.sleep(CATCH_UP_STARTUP_INTERVAL_SEC)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Fix bug where sharded federation senders could get stuck repeatedly querying the DB in a loop, using lots of CPU.`
Original file line number	Diff line number	Diff line change
`@@ -734,16 +734,18 @@ async def _wake_destinations_needing_catchup(self) -> None:`
`734`	`734`	`self._catchup_after_startup_timer = None`
`735`	`735`	`break`
`736`	`736`
	`737`	`+ last_processed = destinations_to_wake[-1]`
	`738`	`+`
`737`	`739`	`destinations_to_wake = [`
`738`	`740`	`d`
`739`	`741`	`for d in destinations_to_wake`
`740`	`742`	`if self._federation_shard_config.should_handle(self._instance_name, d)`
`741`	`743`	`]`
`742`	`744`
`743`		`- for last_processed in destinations_to_wake:`
	`745`	`+ for destination in destinations_to_wake:`
`744`	`746`	`logger.info(`
`745`	`747`	`"Destination %s has outstanding catch-up, waking up.",`
`746`	`748`	`last_processed,`
`747`	`749`	`)`
`748`		`- self.wake_destination(last_processed)`
	`750`	`+ self.wake_destination(destination)`
`749`	`751`	`await self.clock.sleep(CATCH_UP_STARTUP_INTERVAL_SEC)`