Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 335f52d

Browse files
authored
Improve handling of non-ASCII characters in user directory search (#15143)
* Fix a long-standing bug where non-ASCII characters in search terms, including accented letters, would not match characters in a different case. * Fix a long-standing bug where search terms using combining accents would not match display names using precomposed accents and vice versa. To fully take effect, the user directory must be rebuilt after this change. Fixes #14630. Signed-off-by: Sean Quah <[email protected]>
1 parent 682151a commit 335f52d

File tree

3 files changed

+184
-2
lines changed

3 files changed

+184
-2
lines changed

changelog.d/15143.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a long-standing bug where the user directory search was not case-insensitive for accented characters.

synapse/storage/databases/main/user_directory.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import logging
1616
import re
17+
import unicodedata
1718
from typing import (
1819
TYPE_CHECKING,
1920
Iterable,
@@ -490,6 +491,11 @@ def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None:
490491
values={"display_name": display_name, "avatar_url": avatar_url},
491492
)
492493

494+
# The display name that goes into the database index.
495+
index_display_name = display_name
496+
if index_display_name is not None:
497+
index_display_name = _filter_text_for_index(index_display_name)
498+
493499
if isinstance(self.database_engine, PostgresEngine):
494500
# We weight the localpart most highly, then display name and finally
495501
# server name
@@ -507,11 +513,15 @@ def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None:
507513
user_id,
508514
get_localpart_from_id(user_id),
509515
get_domain_from_id(user_id),
510-
display_name,
516+
index_display_name,
511517
),
512518
)
513519
elif isinstance(self.database_engine, Sqlite3Engine):
514-
value = "%s %s" % (user_id, display_name) if display_name else user_id
520+
value = (
521+
"%s %s" % (user_id, index_display_name)
522+
if index_display_name
523+
else user_id
524+
)
515525
self.db_pool.simple_upsert_txn(
516526
txn,
517527
table="user_directory_search",
@@ -896,6 +906,41 @@ async def search_user_dir(
896906
return {"limited": limited, "results": results[0:limit]}
897907

898908

909+
def _filter_text_for_index(text: str) -> str:
910+
"""Transforms text before it is inserted into the user directory index, or searched
911+
for in the user directory index.
912+
913+
Note that the user directory search table needs to be rebuilt whenever this function
914+
changes.
915+
"""
916+
# Lowercase the text, to make searches case-insensitive.
917+
# This is necessary for both PostgreSQL and SQLite. PostgreSQL's
918+
# `to_tsquery/to_tsvector` functions don't lowercase non-ASCII characters when using
919+
# the "C" collation, while SQLite just doesn't lowercase non-ASCII characters at
920+
# all.
921+
text = text.lower()
922+
923+
# Normalize the text. NFKC normalization has two effects:
924+
# 1. It canonicalizes the text, ie. maps all visually identical strings to the same
925+
# string. For example, ["e", "◌́"] is mapped to ["é"].
926+
# 2. It maps strings that are roughly equivalent to the same string.
927+
# For example, ["dž"] is mapped to ["d", "ž"], ["①"] to ["1"] and ["i⁹"] to
928+
# ["i", "9"].
929+
text = unicodedata.normalize("NFKC", text)
930+
931+
# Note that nothing is done to make searches accent-insensitive.
932+
# That could be achieved by converting to NFKD form instead (with combining accents
933+
# split out) and filtering out combining accents using `unicodedata.combining(c)`.
934+
# The downside of this may be noisier search results, since search terms with
935+
# explicit accents will match characters with no accents, or completely different
936+
# accents.
937+
#
938+
# text = unicodedata.normalize("NFKD", text)
939+
# text = "".join([c for c in text if not unicodedata.combining(c)])
940+
941+
return text
942+
943+
899944
def _parse_query_sqlite(search_term: str) -> str:
900945
"""Takes a plain unicode string from the user and converts it into a form
901946
that can be passed to the database.
@@ -905,6 +950,7 @@ def _parse_query_sqlite(search_term: str) -> str:
905950
We specifically add both a prefix and non prefix matching term so that
906951
exact matches get ranked higher.
907952
"""
953+
search_term = _filter_text_for_index(search_term)
908954

909955
# Pull out the individual words, discarding any non-word characters.
910956
results = _parse_words(search_term)
@@ -917,6 +963,8 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
917963
We use this so that we can add prefix matching, which isn't something
918964
that is supported by default.
919965
"""
966+
search_term = _filter_text_for_index(search_term)
967+
920968
escaped_words = []
921969
for word in _parse_words(search_term):
922970
# Postgres tsvector and tsquery quoting rules:

tests/storage/test_user_directory.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,139 @@ def test_search_user_dir_start_of_user_id(self) -> None:
504504
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
505505
)
506506

507+
@override_config({"user_directory": {"search_all_users": True}})
508+
def test_search_user_dir_ascii_case_insensitivity(self) -> None:
509+
"""Tests that a user can look up another user by searching for their name in a
510+
different case.
511+
"""
512+
CHARLIE = "@someuser:example.org"
513+
self.get_success(
514+
self.store.update_profile_in_user_dir(CHARLIE, "Charlie", None)
515+
)
516+
517+
r = self.get_success(self.store.search_user_dir(ALICE, "cHARLIE", 10))
518+
self.assertFalse(r["limited"])
519+
self.assertEqual(1, len(r["results"]))
520+
self.assertDictEqual(
521+
r["results"][0],
522+
{"user_id": CHARLIE, "display_name": "Charlie", "avatar_url": None},
523+
)
524+
525+
@override_config({"user_directory": {"search_all_users": True}})
526+
def test_search_user_dir_unicode_case_insensitivity(self) -> None:
527+
"""Tests that a user can look up another user by searching for their name in a
528+
different case, when the name contains non-ASCII characters.
529+
"""
530+
IVAN = "@someuser:example.org"
531+
self.get_success(self.store.update_profile_in_user_dir(IVAN, "Иван", None))
532+
533+
r = self.get_success(self.store.search_user_dir(ALICE, "иВАН", 10))
534+
self.assertFalse(r["limited"])
535+
self.assertEqual(1, len(r["results"]))
536+
self.assertDictEqual(
537+
r["results"][0],
538+
{"user_id": IVAN, "display_name": "Иван", "avatar_url": None},
539+
)
540+
541+
@override_config({"user_directory": {"search_all_users": True}})
542+
def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None:
543+
"""Tests that a user can look up another user by searching for their name in a
544+
different case, when their name contains dotted or dotless "i"s.
545+
546+
Some languages have dotted and dotless versions of "i", which are considered to
547+
be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the
548+
ASCII "i" and "I" code points, despite having different lowercase / uppercase
549+
forms.
550+
"""
551+
USER = "@someuser:example.org"
552+
553+
expected_matches = [
554+
# (search_term, display_name)
555+
# A search for "i" should match "İ".
556+
("iiiii", "İİİİİ"),
557+
# A search for "I" should match "ı".
558+
("IIIII", "ııııı"),
559+
# A search for "ı" should match "I".
560+
("ııııı", "IIIII"),
561+
# A search for "İ" should match "i".
562+
("İİİİİ", "iiiii"),
563+
]
564+
565+
for search_term, display_name in expected_matches:
566+
self.get_success(
567+
self.store.update_profile_in_user_dir(USER, display_name, None)
568+
)
569+
570+
r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
571+
self.assertFalse(r["limited"])
572+
self.assertEqual(
573+
1,
574+
len(r["results"]),
575+
f"searching for {search_term!r} did not match {display_name!r}",
576+
)
577+
self.assertDictEqual(
578+
r["results"][0],
579+
{"user_id": USER, "display_name": display_name, "avatar_url": None},
580+
)
581+
582+
# We don't test for negative matches, to allow implementations that consider all
583+
# the i variants to be the same.
584+
585+
test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported" # type: ignore
586+
587+
@override_config({"user_directory": {"search_all_users": True}})
588+
def test_search_user_dir_unicode_normalization(self) -> None:
589+
"""Tests that a user can look up another user by searching for their name with
590+
either composed or decomposed accents.
591+
"""
592+
AMELIE = "@someuser:example.org"
593+
594+
expected_matches = [
595+
# (search_term, display_name)
596+
("Ame\u0301lie", "Amélie"),
597+
("Amélie", "Ame\u0301lie"),
598+
]
599+
600+
for search_term, display_name in expected_matches:
601+
self.get_success(
602+
self.store.update_profile_in_user_dir(AMELIE, display_name, None)
603+
)
604+
605+
r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
606+
self.assertFalse(r["limited"])
607+
self.assertEqual(
608+
1,
609+
len(r["results"]),
610+
f"searching for {search_term!r} did not match {display_name!r}",
611+
)
612+
self.assertDictEqual(
613+
r["results"][0],
614+
{"user_id": AMELIE, "display_name": display_name, "avatar_url": None},
615+
)
616+
617+
@override_config({"user_directory": {"search_all_users": True}})
618+
def test_search_user_dir_accent_insensitivity(self) -> None:
619+
"""Tests that a user can look up another user by searching for their name
620+
without any accents.
621+
"""
622+
AMELIE = "@someuser:example.org"
623+
self.get_success(self.store.update_profile_in_user_dir(AMELIE, "Amélie", None))
624+
625+
r = self.get_success(self.store.search_user_dir(ALICE, "amelie", 10))
626+
self.assertFalse(r["limited"])
627+
self.assertEqual(1, len(r["results"]))
628+
self.assertDictEqual(
629+
r["results"][0],
630+
{"user_id": AMELIE, "display_name": "Amélie", "avatar_url": None},
631+
)
632+
633+
# It may be desirable for "é"s in search terms to not match plain "e"s and we
634+
# really don't want "é"s in search terms to match "e"s with different accents.
635+
# But we don't test for this to allow implementations that consider all
636+
# "e"-lookalikes to be the same.
637+
638+
test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore
639+
507640

508641
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
509642
use_icu = True

0 commit comments

Comments
 (0)