Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit b83e822

Browse files
David RobertsonMadLittleModsreivilibresquahtxbabolivier
authored
Stop user directory from failing if it encounters users not in the users table. (#11053)
The following scenarios would halt the user directory updater: - user joins room - user leaves room - user present in room which switches from private to public, or vice versa. for two classes of users: - appservice senders - users missing from the user table. If this happened, the user directory would be stuck, unable to make forward progress. Exclude both cases from the user directory, so that we ignore them. Co-authored-by: Eric Eastwood <[email protected]> Co-authored-by: reivilibre <[email protected]> Co-authored-by: Sean Quah <[email protected]> Co-authored-by: Brendan Abolivier <[email protected]>
1 parent 1db9282 commit b83e822

File tree

13 files changed

+921
-93
lines changed

13 files changed

+921
-93
lines changed

changelog.d/10825.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add an 'approximate difference' method to `StateFilter`.

changelog.d/10970.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix inconsistent behavior of `get_last_client_by_ip` when reporting data that has not been stored in the database yet.

changelog.d/10996.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a bug introduced in Synapse 1.21.0 that causes opentracing and Prometheus metrics for replication requests to be measured incorrectly.

changelog.d/11053.bugfix

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix a bug introduced in Synapse 1.45.0rc1 where the user directory would stop updating if it processed an event from a
2+
user not in the `users` table.

synapse/logging/opentracing.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,14 @@ def err_back(result):
807807
result.addCallbacks(call_back, err_back)
808808

809809
else:
810+
if inspect.isawaitable(result):
811+
logger.error(
812+
"@trace may not have wrapped %s correctly! "
813+
"The function is not async but returned a %s.",
814+
func.__qualname__,
815+
type(result).__name__,
816+
)
817+
810818
scope.__exit__(None, None, None)
811819

812820
return result

synapse/replication/http/_base.py

Lines changed: 78 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -182,85 +182,87 @@ def make_client(cls, hs):
182182
)
183183

184184
@trace(opname="outgoing_replication_request")
185-
@outgoing_gauge.track_inprogress()
186185
async def send_request(*, instance_name="master", **kwargs):
187-
if instance_name == local_instance_name:
188-
raise Exception("Trying to send HTTP request to self")
189-
if instance_name == "master":
190-
host = master_host
191-
port = master_port
192-
elif instance_name in instance_map:
193-
host = instance_map[instance_name].host
194-
port = instance_map[instance_name].port
195-
else:
196-
raise Exception(
197-
"Instance %r not in 'instance_map' config" % (instance_name,)
186+
with outgoing_gauge.track_inprogress():
187+
if instance_name == local_instance_name:
188+
raise Exception("Trying to send HTTP request to self")
189+
if instance_name == "master":
190+
host = master_host
191+
port = master_port
192+
elif instance_name in instance_map:
193+
host = instance_map[instance_name].host
194+
port = instance_map[instance_name].port
195+
else:
196+
raise Exception(
197+
"Instance %r not in 'instance_map' config" % (instance_name,)
198+
)
199+
200+
data = await cls._serialize_payload(**kwargs)
201+
202+
url_args = [
203+
urllib.parse.quote(kwargs[name], safe="") for name in cls.PATH_ARGS
204+
]
205+
206+
if cls.CACHE:
207+
txn_id = random_string(10)
208+
url_args.append(txn_id)
209+
210+
if cls.METHOD == "POST":
211+
request_func = client.post_json_get_json
212+
elif cls.METHOD == "PUT":
213+
request_func = client.put_json
214+
elif cls.METHOD == "GET":
215+
request_func = client.get_json
216+
else:
217+
# We have already asserted in the constructor that a
218+
# compatible was picked, but lets be paranoid.
219+
raise Exception(
220+
"Unknown METHOD on %s replication endpoint" % (cls.NAME,)
221+
)
222+
223+
uri = "http://%s:%s/_synapse/replication/%s/%s" % (
224+
host,
225+
port,
226+
cls.NAME,
227+
"/".join(url_args),
198228
)
199229

200-
data = await cls._serialize_payload(**kwargs)
201-
202-
url_args = [
203-
urllib.parse.quote(kwargs[name], safe="") for name in cls.PATH_ARGS
204-
]
205-
206-
if cls.CACHE:
207-
txn_id = random_string(10)
208-
url_args.append(txn_id)
209-
210-
if cls.METHOD == "POST":
211-
request_func = client.post_json_get_json
212-
elif cls.METHOD == "PUT":
213-
request_func = client.put_json
214-
elif cls.METHOD == "GET":
215-
request_func = client.get_json
216-
else:
217-
# We have already asserted in the constructor that a
218-
# compatible was picked, but lets be paranoid.
219-
raise Exception(
220-
"Unknown METHOD on %s replication endpoint" % (cls.NAME,)
221-
)
222-
223-
uri = "http://%s:%s/_synapse/replication/%s/%s" % (
224-
host,
225-
port,
226-
cls.NAME,
227-
"/".join(url_args),
228-
)
229-
230-
try:
231-
# We keep retrying the same request for timeouts. This is so that we
232-
# have a good idea that the request has either succeeded or failed on
233-
# the master, and so whether we should clean up or not.
234-
while True:
235-
headers: Dict[bytes, List[bytes]] = {}
236-
# Add an authorization header, if configured.
237-
if replication_secret:
238-
headers[b"Authorization"] = [b"Bearer " + replication_secret]
239-
opentracing.inject_header_dict(headers, check_destination=False)
240-
try:
241-
result = await request_func(uri, data, headers=headers)
242-
break
243-
except RequestTimedOutError:
244-
if not cls.RETRY_ON_TIMEOUT:
245-
raise
246-
247-
logger.warning("%s request timed out; retrying", cls.NAME)
248-
249-
# If we timed out we probably don't need to worry about backing
250-
# off too much, but lets just wait a little anyway.
251-
await clock.sleep(1)
252-
except HttpResponseException as e:
253-
# We convert to SynapseError as we know that it was a SynapseError
254-
# on the main process that we should send to the client. (And
255-
# importantly, not stack traces everywhere)
256-
_outgoing_request_counter.labels(cls.NAME, e.code).inc()
257-
raise e.to_synapse_error()
258-
except Exception as e:
259-
_outgoing_request_counter.labels(cls.NAME, "ERR").inc()
260-
raise SynapseError(502, "Failed to talk to main process") from e
261-
262-
_outgoing_request_counter.labels(cls.NAME, 200).inc()
263-
return result
230+
try:
231+
# We keep retrying the same request for timeouts. This is so that we
232+
# have a good idea that the request has either succeeded or failed
233+
# on the master, and so whether we should clean up or not.
234+
while True:
235+
headers: Dict[bytes, List[bytes]] = {}
236+
# Add an authorization header, if configured.
237+
if replication_secret:
238+
headers[b"Authorization"] = [
239+
b"Bearer " + replication_secret
240+
]
241+
opentracing.inject_header_dict(headers, check_destination=False)
242+
try:
243+
result = await request_func(uri, data, headers=headers)
244+
break
245+
except RequestTimedOutError:
246+
if not cls.RETRY_ON_TIMEOUT:
247+
raise
248+
249+
logger.warning("%s request timed out; retrying", cls.NAME)
250+
251+
# If we timed out we probably don't need to worry about backing
252+
# off too much, but lets just wait a little anyway.
253+
await clock.sleep(1)
254+
except HttpResponseException as e:
255+
# We convert to SynapseError as we know that it was a SynapseError
256+
# on the main process that we should send to the client. (And
257+
# importantly, not stack traces everywhere)
258+
_outgoing_request_counter.labels(cls.NAME, e.code).inc()
259+
raise e.to_synapse_error()
260+
except Exception as e:
261+
_outgoing_request_counter.labels(cls.NAME, "ERR").inc()
262+
raise SynapseError(502, "Failed to talk to main process") from e
263+
264+
_outgoing_request_counter.labels(cls.NAME, 200).inc()
265+
return result
264266

265267
return send_request
266268

synapse/storage/databases/main/client_ips.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -538,15 +538,20 @@ async def get_last_client_ip_by_device(
538538
"""
539539
ret = await super().get_last_client_ip_by_device(user_id, device_id)
540540

541-
# Update what is retrieved from the database with data which is pending insertion.
541+
# Update what is retrieved from the database with data which is pending
542+
# insertion, as if it has already been stored in the database.
542543
for key in self._batch_row_update:
543-
uid, access_token, ip = key
544+
uid, _access_token, ip = key
544545
if uid == user_id:
545546
user_agent, did, last_seen = self._batch_row_update[key]
547+
548+
if did is None:
549+
# These updates don't make it to the `devices` table
550+
continue
551+
546552
if not device_id or did == device_id:
547-
ret[(user_id, device_id)] = {
553+
ret[(user_id, did)] = {
548554
"user_id": user_id,
549-
"access_token": access_token,
550555
"ip": ip,
551556
"user_agent": user_agent,
552557
"device_id": did,

synapse/storage/databases/main/user_directory.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
cast,
2727
)
2828

29+
from synapse.api.errors import StoreError
30+
2931
if TYPE_CHECKING:
3032
from synapse.server import HomeServer
3133

@@ -383,7 +385,19 @@ async def should_include_local_user_in_dir(self, user: str) -> bool:
383385
"""Certain classes of local user are omitted from the user directory.
384386
Is this user one of them?
385387
"""
386-
# App service users aren't usually contactable, so exclude them.
388+
# We're opting to exclude the appservice sender (user defined by the
389+
# `sender_localpart` in the appservice registration) even though
390+
# technically it could be DM-able. In the future, this could potentially
391+
# be configurable per-appservice whether the appservice sender can be
392+
# contacted.
393+
if self.get_app_service_by_user_id(user) is not None:
394+
return False
395+
396+
# We're opting to exclude appservice users (anyone matching the user
397+
# namespace regex in the appservice registration) even though technically
398+
# they could be DM-able. In the future, this could potentially
399+
# be configurable per-appservice whether the appservice users can be
400+
# contacted.
387401
if self.get_if_app_services_interested_in_user(user):
388402
# TODO we might want to make this configurable for each app service
389403
return False
@@ -393,8 +407,14 @@ async def should_include_local_user_in_dir(self, user: str) -> bool:
393407
return False
394408

395409
# Deactivated users aren't contactable, so should not appear in the user directory.
396-
if await self.get_user_deactivated_status(user):
410+
try:
411+
if await self.get_user_deactivated_status(user):
412+
return False
413+
except StoreError:
414+
# No such user in the users table. No need to do this when calling
415+
# is_support_user---that returns False if the user is missing.
397416
return False
417+
398418
return True
399419

400420
async def is_room_world_readable_or_publicly_joinable(self, room_id: str) -> bool:

0 commit comments

Comments
 (0)