Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion changelog.d/19036.misc
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Move `start_doing_background_updates()` to `SynapseHomeServer.start_background_tasks()`.
Move `start_doing_background_updates()` to `HomeServer.start_background_tasks()`.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the changelog from #19036 to merge with this one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PresenceHandlerInitTestCase failures ❌

PresenceHandlerInitTestCase purposely tries to disable background tasks to keep PresenceHandler uninitialized until the test is prepared. It tries to test that "presence state restored from the database [when PresenceHandler is initialized] should not persist forever".

This conflicts with the changes currently being made where we expect the background tasks to run because we want the database background tasks to complete before we start the tests.

class PresenceHandlerInitTestCase(unittest.HomeserverTestCase):
def default_config(self) -> JsonDict:
config = super().default_config()
# Disable background tasks on this worker so that the PresenceHandler isn't
# loaded until we request it.
config["run_background_tasks_on"] = "other"
return config

Example failures:

SYNAPSE_TEST_LOG_LEVEL=INFO poetry run trial tests.handlers.test_presence.PresenceHandlerInitTestCase
The "poetry.dev-dependencies" section is deprecated and will be removed in a future version. Use "poetry.group.dev.dependencies" instead.
tests.handlers.test_presence
  PresenceHandlerInitTestCase
    test_restored_presence_idles ...                                    [ERROR]
    test_restored_presence_online_after_sync_0_org_matrix_msc3026_busy ... [ERROR]
    test_restored_presence_online_after_sync_1_online ...               [ERROR]
    test_restored_presence_online_after_sync_2_unavailable ...          [ERROR]
    test_restored_presence_online_after_sync_3_offline ...              [ERROR]

===============================================================================
[ERROR]
Traceback (most recent call last):
  File "/home/eric/Documents/github/element/synapse/tests/unittest.py", line 138, in new
    return code(orig, *args, **kwargs)
  File "/home/eric/Documents/github/element/synapse/tests/unittest.py", line 232, in setUp
    return orig()
  File "/home/eric/Documents/github/element/synapse/tests/unittest.py", line 404, in setUp
    self.hs = self.make_homeserver(self.reactor, self.clock)
  File "/home/eric/Documents/github/element/synapse/tests/unittest.py", line 525, in make_homeserver
    hs = self.setup_test_homeserver(reactor=reactor, clock=clock)
  File "/home/eric/Documents/github/element/synapse/tests/unittest.py", line 733, in setup_test_homeserver
    raise AssertionError(
builtins.AssertionError: Timed out waiting for background updates to complete (5s). Did you forget to `start_doing_background_updates()`?

tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_idles
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_0_org_matrix_msc3026_busy
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_1_online
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_2_unavailable
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_3_offline
-------------------------------------------------------------------------------
Ran 5 tests in 25.208s

FAILED (errors=5)

Copy link
Contributor Author

@MadLittleMods MadLittleMods Oct 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Database isn't setup for tests that use GenericWorkerServer

We have a few tests that use homeserver_to_use=GenericWorkerServer and are failing with builtins.AssertionError: Timed out waiting for background updates to complete (5s). Did you forget to 'start_doing_background_updates()'? in CI (from our new assertion).

Error: 
Traceback (most recent call last):
  File "/home/runner/work/synapse/synapse/tests/unittest.py", line 138, in new
    return code(orig, *args, **kwargs)
  File "/home/runner/work/synapse/synapse/tests/unittest.py", line 232, in setUp
    return orig()
  File "/home/runner/work/synapse/synapse/tests/unittest.py", line 404, in setUp
    self.hs = self.make_homeserver(self.reactor, self.clock)
  File "/home/runner/work/synapse/synapse/tests/app/test_openid_listener.py", line 40, in make_homeserver
    hs = self.setup_test_homeserver(homeserver_to_use=GenericWorkerServer)
  File "/home/runner/work/synapse/synapse/tests/unittest.py", line 733, in setup_test_homeserver
    raise AssertionError(
builtins.AssertionError: Timed out waiting for background updates to complete (5s). Did you forget to `start_doing_background_updates()`?

tests.app.test_openid_listener.FederationReaderOpenIDListenerTests.test_openid_listener_0
tests.app.test_openid_listener.FederationReaderOpenIDListenerTests.test_openid_listener_1
tests.app.test_openid_listener.FederationReaderOpenIDListenerTests.test_openid_listener_2
tests.app.test_openid_listener.FederationReaderOpenIDListenerTests.test_openid_listener_3

...

tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_idles
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_0_org_matrix_msc3026_busy
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_1_online
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_2_unavailable
tests.handlers.test_presence.PresenceHandlerInitTestCase.test_restored_presence_online_after_sync_3_offline

...

tests.replication.test_federation_ack.FederationAckTestCase.test_federation_ack_sent

...

tests.storage.test_rollback_worker.WorkerSchemaTests.test_not_upgraded_current_schema_version_with_outstanding_deltas
tests.storage.test_rollback_worker.WorkerSchemaTests.test_not_upgraded_old_schema_version
tests.storage.test_rollback_worker.WorkerSchemaTests.test_rolling_back

Previously, we didn't wait for background updates in these cases and only ran the database background updates for the TestHomeServer (the default in tests). Basically turning a blind eye to this kind of problem.

synapse/tests/unittest.py

Lines 680 to 696 in ec7554b

async def run_bg_updates() -> None:
with LoggingContext(name="run_bg_updates", server_name=server_name):
self.get_success(stor.db_pool.updates.run_background_updates(False))
hs = setup_test_homeserver(
cleanup_func=self.addCleanup,
server_name=server_name,
config=config_obj,
reactor=reactor,
clock=clock,
**extra_homeserver_attributes,
)
stor = hs.get_datastores().main
# Run the database background updates, when running against "master".
if hs.__class__.__name__ == "TestHomeServer":
self.get_success(run_bg_updates())

Unable to run test by itself

Compounding things is that we have to deal with #15671 (previously matrix-org/synapse#15671) which prevents us from running these tests on their own so it's even harder to debug what's going on.

synapse.storage.prepare_database.UpgradeDatabaseException: Uninitialised database: run the main synapse process to prepare the database schema before starting worker processes.

1 change: 1 addition & 0 deletions changelog.d/19057.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Move `start_doing_background_updates()` to `HomeServer.start_background_tasks()`.
5 changes: 0 additions & 5 deletions synapse/app/homeserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,6 @@ def start_listening(self) -> None:
# during parsing
logger.warning("Unrecognized listener type: %s", listener.type)

def start_background_tasks(self) -> None:
super().start_background_tasks()

self.get_datastores().main.db_pool.updates.start_doing_background_updates()


def load_or_generate_config(argv_options: List[str]) -> HomeServerConfig:
"""
Expand Down
1 change: 1 addition & 0 deletions synapse/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,7 @@ def start_background_tasks(self) -> None:
self.get_task_scheduler()
self.get_common_usage_metrics_manager().setup()
start_phone_stats_home(self)
self.get_datastores().main.db_pool.updates.start_doing_background_updates()
Copy link
Contributor Author

@MadLittleMods MadLittleMods Oct 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To call this out again; before #19036 (comment), start_doing_background_updates() only ran on the main Synapse instance.

With this change, it runs on whatever instance is configured to run_background_tasks_on.

Based on the previous discussion, we thought this was fine. I just didn't have this in the correct spot to match that description.


def get_reactor(self) -> ISynapseReactor:
"""
Expand Down
4 changes: 2 additions & 2 deletions synapse/util/task_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@
class TaskScheduler:
"""
This is a simple task scheduler designed for resumable tasks. Normally,
you'd use `run_in_background` to start a background task or Twisted's
`deferLater` if you want to run it later.
you'd use `run_in_background` to start a background task or `clock.call_later`
if you want to run it later.

The issue is that these tasks stop completely and won't resume if Synapse is
shut down for any reason.
Expand Down
3 changes: 1 addition & 2 deletions tests/rest/admin/test_room.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

from parameterized import parameterized

from twisted.internet.task import deferLater
from twisted.internet.testing import MemoryReactor

import synapse.rest.admin
Expand Down Expand Up @@ -861,7 +860,7 @@ def test_delete_same_room_twice(self) -> None:
# Mock PaginationHandler.purge_room to sleep for 100s, so we have time to do a second call
# before the purge is over. Note that it doesn't purge anymore, but we don't care.
async def purge_room(room_id: str, force: bool) -> None:
await deferLater(self.hs.get_reactor(), 100, lambda: None)
Copy link
Contributor Author

@MadLittleMods MadLittleMods Oct 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of these deferLater changes have been split off to #19058 so we could merge that first in order to clean up the diff here.

They are changed here because we started seeing these failures in CI with this PR for some reason:

Example failure: https://github.com/element-hq/synapse/actions/runs/18477454390/job/52645183606?pr=19057

builtins.AssertionError: Expected `looping_call` callback from the reactor to start with the sentinel logcontext but saw task-_resumable_task-0-IBzAmHUoepQfLnEA. In other words, another task shouldn't have leaked their logcontext to us.

await self.hs.get_clock().sleep(100)

self.pagination_handler.purge_room = AsyncMock(side_effect=purge_room) # type: ignore[method-assign]

Expand Down
62 changes: 52 additions & 10 deletions tests/unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@
from synapse.http.site import SynapseRequest, SynapseSite
from synapse.logging.context import (
SENTINEL_CONTEXT,
LoggingContext,
current_context,
set_current_context,
)
from synapse.rest import RegisterServletsFunc
from synapse.server import HomeServer
from synapse.storage.background_updates import UpdaterStatus
from synapse.storage.keys import FetchKeyResult
from synapse.types import ISynapseReactor, JsonDict, Requester, UserID, create_requester
from synapse.util.clock import Clock
Expand Down Expand Up @@ -108,6 +108,11 @@
R = TypeVar("R")
S = TypeVar("S")

BACKGROUND_UPDATE_TIMEOUT_SECONDS = 5
"""
We expect this to be pretty immediate as we're working with an empty database.
"""


class _TypedFailure(Generic[_ExcType], Protocol):
"""Extension to twisted.Failure, where the 'value' has a certain type."""
Expand Down Expand Up @@ -490,7 +495,11 @@ def wait_on_thread(self, deferred: Deferred, timeout: int = 10) -> None:

def wait_for_background_updates(self) -> None:
"""Block until all background database updates have completed."""
store = self.hs.get_datastores().main
self._wait_for_background_updates(self.hs)

def _wait_for_background_updates(self, hs: HomeServer) -> None:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We no longer use this variant but I think it's useful to reference in the comment down below.

"""Block until all background database updates have completed."""
store = hs.get_datastores().main
while not self.get_success(
store.db_pool.updates.has_completed_background_updates()
):
Expand Down Expand Up @@ -677,10 +686,6 @@ def setup_test_homeserver(
# construct a homeserver with a matching name.
server_name = config_obj.server.server_name

async def run_bg_updates() -> None:
with LoggingContext(name="run_bg_updates", server_name=server_name):
self.get_success(stor.db_pool.updates.run_background_updates(False))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One difference to note here is that we're running with run_background_updates(False) which sets argument sleep: False and determines:

sleep: Whether to limit how quickly we run background updates or
not.

Versus now where we run this via start_doing_background_updates() and sleep is configured via hs.config.background_updates.sleep_enabled and default to True. This means that we're potentially running slower in tests now. Will post an update to compare before/after to see if it even matters.

def start_doing_background_updates(self) -> None:
if self.enabled:
# if we start a new background update, not all updates are done.
self._all_done = False
sleep = self.sleep_enabled
self.hs.run_as_background_process(
"background_updates",
self.run_background_updates,
sleep,
)


hs = setup_test_homeserver(
cleanup_func=self.addCleanup,
server_name=server_name,
Expand All @@ -689,11 +694,48 @@ async def run_bg_updates() -> None:
clock=clock,
**extra_homeserver_attributes,
)
stor = hs.get_datastores().main

# Run the database background updates, when running against "master".
if hs.__class__.__name__ == "TestHomeServer":
self.get_success(run_bg_updates())
# Wait for the database background updates to complete. This is important
# because tests assume that the database is using the latest schema.
#
# We could use `self._wait_for_background_updates(hs)` to accomplish the same
# thing but we don't want to start or drive the background updates here. We want
# to ensure the homeserver itself is doing that.
start_time_s = time.time()
store = hs.get_datastores().main
while not self.get_success(
# This check is slightly naive. It only checks if there is anything left in
# the `background_updates` database table so it is possible that the
# homeserver mistakenly never registered any background updates to be run.
# Since `register_background_xxx(...)` is done across the codebase, we can't
# really assert that everything was registered as expected.
store.db_pool.updates.has_completed_background_updates()
):
# Timeout if it takes too long. This should be pretty immediate as we're
# working with an empty database.
current_time_s = time.time()
if current_time_s - start_time_s > BACKGROUND_UPDATE_TIMEOUT_SECONDS:
background_update_status = store.db_pool.updates.get_status()

# Add some better context when we give up
extra_message = ""
if background_update_status == UpdaterStatus.NOT_STARTED:
extra_message = (
"Did you forget to `start_doing_background_updates()`?"
)
elif background_update_status == UpdaterStatus.RUNNING_UPDATE:
extra_message = "Background updates were still running when we gave up. Are they stuck?"
else:
extra_message = (
f"Background update status was {background_update_status}."
)

raise AssertionError(
f"Timed out waiting for background updates to complete ({BACKGROUND_UPDATE_TIMEOUT_SECONDS}s). "
+ extra_message
)

self.pump(by=0.1)

return hs

Expand Down
9 changes: 6 additions & 3 deletions tests/util/test_task_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
#
from typing import List, Optional, Tuple

from twisted.internet.task import deferLater
from twisted.internet.defer import Deferred
from twisted.internet.testing import MemoryReactor

from synapse.logging.context import make_deferred_yieldable
from synapse.server import HomeServer
from synapse.types import JsonMapping, ScheduledTask, TaskStatus
from synapse.util.clock import Clock
Expand Down Expand Up @@ -87,7 +88,7 @@ async def _sleeping_task(
self, task: ScheduledTask
) -> Tuple[TaskStatus, Optional[JsonMapping], Optional[str]]:
# Sleep for a second
await deferLater(self.reactor, 1, lambda: None)
await self.hs.get_clock().sleep(1)
return TaskStatus.COMPLETE, None, None

def test_schedule_lot_of_tasks(self) -> None:
Expand Down Expand Up @@ -170,8 +171,10 @@ async def _resumable_task(
return TaskStatus.COMPLETE, {"success": True}, None
else:
await self.task_scheduler.update_task(task.id, result={"in_progress": True})
# Create a deferred which we will never complete
incomplete_d: Deferred = Deferred()
# Await forever to simulate an aborted task because of a restart
await deferLater(self.reactor, 2**16, lambda: None)
await make_deferred_yieldable(incomplete_d)
# This should never been called
return TaskStatus.ACTIVE, None, None

Expand Down
Loading