autocomplete: Match insensitively to diacritics

chrisbobbe · chrisbobbe · commit 1af7a2534619 · 2025-08-20T10:55:55.000-07:00
Fixes zulip#237. Fixes zulip#1067.
diff --git a/lib/model/autocomplete.dart b/lib/model/autocomplete.dart
@@ -2,6 +2,7 @@ import 'dart:math';
 
 import 'package:flutter/foundation.dart';
 import 'package:flutter/services.dart';
+import 'package:unorm_dart/unorm_dart.dart' as unorm;
 
 import '../api/model/events.dart';
 import '../api/model/model.dart';
@@ -732,12 +733,12 @@ class MentionAutocompleteView extends AutocompleteView<MentionAutocompleteQuery,
 /// An [AutocompleteQuery] object stores the user's actual query string
 /// as [raw].
 /// It may also store processed forms of the query
-/// (for example, converted to lowercase or split on whitespace)
+/// (for example, normalized by case and diacritics, or split on whitespace)
 /// to prepare for whatever particular form of searching will be done
 /// for the given type of autocomplete interaction.
 abstract class AutocompleteQuery {
   AutocompleteQuery(this.raw) {
-    _normalized = raw.toLowerCase();
+    _normalized = lowercaseAndStripDiacritics(raw);
     // TODO(#1805) split on space characters that the user is actually using
     //   (e.g. U+3000 IDEOGRAPHIC SPACE);
     //   could check active keyboard or just split on all kinds of spaces
@@ -751,15 +752,23 @@ abstract class AutocompleteQuery {
 
   late final List<String> _normalizedWords;
 
+  static final RegExp _regExpStripMarkCharacters = RegExp(r'\p{M}', unicode: true);
+
+  /// Returns [input] lowercased, NFKD-normalized, and stripped of Unicode marks.
+  ///
+  /// Queries and candidate strings are both passed through this before matching.
+  /// Anders reports that this is what web does; see discussion:
+  ///   https://chat.zulip.org/#narrow/channel/48-mobile/topic/deps.3A.20Add.20new.20package.20to.20handle.20diacritics/near/2244487
+  static String lowercaseAndStripDiacritics(String input) =>
+    unorm.nfkd(input.toLowerCase()).replaceAll(_regExpStripMarkCharacters, '');
+
   /// Whether all of this query's words have matches in [words],
-  /// modulo case, that appear in order.
+  /// insensitively to case and diacritics, that appear in order.
   ///
   /// A "match" means the word in [words] starts with the query word.
   ///
-  /// [words] must all be lowercased.
+  /// [words] must all have been passed through [lowercaseAndStripDiacritics].
   bool _testContainsQueryWords(List<String> words) {
-    // TODO(#237) test with diacritics stripped, where appropriate,
-    //   and update dartdoc's summary line and its restriction about [words].
     int wordsIndex = 0;
     int queryWordsIndex = 0;
     while (true) {
@@ -828,10 +837,9 @@ class MentionAutocompleteQuery extends ComposeAutocompleteQuery {
 
   WildcardMentionAutocompleteResult? testWildcardOption(WildcardMentionOption wildcardOption, {
       required ZulipLocalizations localizations}) {
-    // TODO(#237): match insensitively to diacritics
     final localized = wildcardOption.localizedCanonicalString(localizations);
     final matches = wildcardOption.canonicalString.contains(_normalized)
-      || localized.toLowerCase().contains(_normalized);
+      || AutocompleteQuery.lowercaseAndStripDiacritics(localized).contains(_normalized);
     if (!matches) return null;
     return WildcardMentionAutocompleteResult(
       wildcardOption: wildcardOption, rank: _rankWildcardResult);
@@ -982,7 +990,8 @@ class AutocompleteDataCache {
 
   /// The normalized `fullName` of [user].
   String normalizedNameForUser(User user) {
-    return _normalizedNamesByUser[user.userId] ??= user.fullName.toLowerCase();
+    return _normalizedNamesByUser[user.userId]
+      ??= AutocompleteQuery.lowercaseAndStripDiacritics(user.fullName);
   }
 
   final Map<int, List<String>> _normalizedNameWordsByUser = {};
@@ -996,14 +1005,18 @@ class AutocompleteDataCache {
 
   /// The normalized `deliveryEmail` of [user], or null if that's null.
   String? normalizedEmailForUser(User user) {
-    return _normalizedEmailsByUser[user.userId] ??= user.deliveryEmail?.toLowerCase();
+    final email = user.deliveryEmail; // local copy, so the null check promotes
+    return _normalizedEmailsByUser[user.userId] ??= email == null
+      ? null
+      : AutocompleteQuery.lowercaseAndStripDiacritics(email);
   }
 
   final Map<int, String> _normalizedNamesByUserGroup = {};
 
   /// The normalized `name` of [userGroup].
   String normalizedNameForUserGroup(UserGroup userGroup) {
-    return _normalizedNamesByUserGroup[userGroup.id] ??= userGroup.name.toLowerCase();
+    return _normalizedNamesByUserGroup[userGroup.id]
+      ??= AutocompleteQuery.lowercaseAndStripDiacritics(userGroup.name);
   }
 
   final Map<int, List<String>> _normalizedNameWordsByUserGroup = {};
@@ -1203,11 +1216,11 @@ class TopicAutocompleteQuery extends AutocompleteQuery {
     // TODO(#881): Sort by match relevance, like web does.
 
     if (topic.displayName == null) {
-      return store.realmEmptyTopicDisplayName.toLowerCase()
+      return AutocompleteQuery.lowercaseAndStripDiacritics(store.realmEmptyTopicDisplayName)
         .contains(_normalized);
     }
     return topic.displayName != raw
-      && topic.displayName!.toLowerCase().contains(_normalized);
+      && AutocompleteQuery.lowercaseAndStripDiacritics(topic.displayName!).contains(_normalized);
   }
 
   @override
diff --git a/lib/model/emoji.dart b/lib/model/emoji.dart
@@ -74,21 +74,21 @@ final class EmojiCandidate {
   /// This might not be the only name this emoji has; see [aliases].
   final String emojiName;
 
-  /// [emojiName], but lowercased to support fuzzy matching.
-  // TODO(#1067) also remove diacritics
+  /// [emojiName], but via [AutocompleteQuery.lowercaseAndStripDiacritics]
+  /// to support fuzzy matching.
   String get normalizedEmojiName => _normalizedEmojiName
-    ??= emojiName.toLowerCase();
+    ??= AutocompleteQuery.lowercaseAndStripDiacritics(emojiName);
   String? _normalizedEmojiName;
 
   /// Additional Zulip "emoji name" values for this emoji,
   /// to show in the emoji picker UI.
   Iterable<String> get aliases => _aliases ?? const [];
   final List<String>? _aliases;
 
-  /// [aliases], but lowercased to support fuzzy matching.
-  // TODO(#1067) also remove diacritics
+  /// [aliases], via [AutocompleteQuery.lowercaseAndStripDiacritics] to support
+  /// fuzzy matching; normalized once on first access, then cached as a list.
   Iterable<String> get normalizedAliases => _normalizedAliases
-    ??= aliases.map((alias) => alias.toLowerCase());
+    ??= aliases.map(AutocompleteQuery.lowercaseAndStripDiacritics).toList();
   Iterable<String>? _normalizedAliases;
 
   final EmojiDisplay emojiDisplay;
@@ -513,9 +513,8 @@ class EmojiAutocompleteQuery extends ComposeAutocompleteQuery {
 
   static const _separator = '_';
 
-  static String _adjustQuery(String raw) {
-    return raw.toLowerCase().replaceAll(' ', '_'); // TODO(#1067) remove diacritics too
-  }
+  static String _adjustQuery(String raw) => // normalize first: NFKD can yield ' '
+    AutocompleteQuery.lowercaseAndStripDiacritics(raw).replaceAll(' ', '_');
 
   @override
   EmojiAutocompleteView initViewModel({
diff --git a/test/model/autocomplete_test.dart b/test/model/autocomplete_test.dart
@@ -932,9 +932,11 @@ void main() {
     }
 
     for (final option in WildcardMentionOption.values) {
-      // These are hard-coded, and they happened to be lowercase when written.
+      // These are hard-coded, and they happened to be lowercase and without
+      // diacritics when written.
       // Throw if that changes, to not accidentally break fuzzy matching.
-      check(option.canonicalString).equals(option.canonicalString.toLowerCase());
+      check(option.canonicalString).equals(
+        AutocompleteQuery.lowercaseAndStripDiacritics(option.canonicalString));
     }
 
     final testArabic = wildcardTesterForLocale((locale) => locale.languageCode == 'ar');
@@ -956,6 +958,10 @@ void main() {
     testGerman('Thema',     topicNarrow,   [WildcardMentionOption.topic]);
     testGerman('thema',     topicNarrow,   [WildcardMentionOption.topic]);
 
+    final testPolish = wildcardTesterForLocale((locale) => locale.languageCode == 'pl');
+    testPolish('wątek',     topicNarrow,   [WildcardMentionOption.topic]);
+    testPolish('watek',     topicNarrow,   [WildcardMentionOption.topic]);
+
     test('no wildcards for a silent mention', () {
       check(getWildcardOptionsFor('', isSilent: true, narrow: channelNarrow))
         .isEmpty();
@@ -1075,6 +1081,14 @@ void main() {
       check(rankOf(query, a)!).equals(rankOf(query, b)!);
     }
 
+    void checkAllSameRank(String query, Iterable<Object> candidates) {
+      // (i.e. throw here if it's not a match)
+      final firstCandidateRank = rankOf(query, candidates.first)!;
+
+      final ranks = candidates.skip(1).map((candidate) => rankOf(query, candidate));
+      check(ranks).every((it) => it.equals(firstCandidateRank));
+    }
+
     test('wildcards, then users', () {
       checkSameRank('', WildcardMentionOption.all, WildcardMentionOption.topic);
       checkPrecedes('', WildcardMentionOption.topic, eg.user());
@@ -1087,13 +1101,28 @@ void main() {
       checkPrecedes(user.fullName, WildcardMentionOption.channel, user);
     });
 
-    test('user name matched case-insensitively', () {
-      final user1 = eg.user(fullName: 'Chris Bobbe');
-      final user2 = eg.user(fullName: 'chris bobbe');
+    test('user name match is case- and diacritics-insensitive', () {
+      final users = [
+        eg.user(fullName: 'Édith Piaf'),
+        eg.user(fullName: 'édith piaf'),
+        eg.user(fullName: 'Edith Piaf'),
+        eg.user(fullName: 'edith piaf'),
+      ];
+
+      checkAllSameRank('Édith Piaf', users); // exact
+      checkAllSameRank('Edith Piaf', users); // exact
+      checkAllSameRank('édith piaf', users); // exact
+      checkAllSameRank('edith piaf', users); // exact
 
-      checkSameRank('chris bobbe', user1, user2); // exact
-      checkSameRank('chris bo',    user1, user2); // total-prefix
-      checkSameRank('chr bo',      user1, user2); // word-prefixes
+      checkAllSameRank('Édith Pi',   users); // total-prefix
+      checkAllSameRank('Edith Pi',   users); // total-prefix
+      checkAllSameRank('édith pi',   users); // total-prefix
+      checkAllSameRank('edith pi',   users); // total-prefix
+
+      checkAllSameRank('Éd Pi',      users); // word-prefixes
+      checkAllSameRank('Ed Pi',      users); // word-prefixes
+      checkAllSameRank('éd pi',      users); // word-prefixes
+      checkAllSameRank('ed pi',      users); // word-prefixes
     });
 
     test('user name match: exact over total-prefix', () {
@@ -1110,13 +1139,16 @@ void main() {
       checkPrecedes('so m', user1, user2);
     });
 
-    test('group name matched case-insensitively', () {
-      final userGroup1 = eg.userGroup(name: 'Mobile Team');
-      final userGroup2 = eg.userGroup(name: 'mobile team');
+    test('group name is case- and diacritics-insensitive', () {
+      final userGroups = [
+        eg.userGroup(name: 'Mobile Team'),
+        eg.userGroup(name: 'mobile team'),
+        eg.userGroup(name: 'möbile team'),
+      ];
 
-      checkSameRank('mobile team', userGroup1, userGroup2); // exact
-      checkSameRank('mobile te',   userGroup1, userGroup2); // total-prefix
-      checkSameRank('mob te',      userGroup1, userGroup2); // word-prefixes
+      checkAllSameRank('mobile team', userGroups); // exact
+      checkAllSameRank('mobile te',   userGroups); // total-prefix
+      checkAllSameRank('mob te',      userGroups); // word-prefixes
     });
 
     test('group name match: exact over total-prefix', () {
@@ -1133,16 +1165,19 @@ void main() {
       checkPrecedes('so m', userGroup1, userGroup2);
     });
 
-    test('email matched case-insensitively', () {
+    test('email match is case- and diacritics-insensitive', () {
       // "z" name to prevent accidental name match with example data
-      final user1 = eg.user(fullName: 'z', deliveryEmail: 'email@example.com');
-      final user2 = eg.user(fullName: 'z', deliveryEmail: 'EmAiL@ExAmPlE.com');
-
-      checkSameRank('email@example.com', user1, user2);
-      checkSameRank('email@e',           user1, user2);
-      checkSameRank('email@',            user1, user2);
-      checkSameRank('email',             user1, user2);
-      checkSameRank('ema',               user1, user2);
+      final users = [
+        eg.user(fullName: 'z', deliveryEmail: 'email@example.com'),
+        eg.user(fullName: 'z', deliveryEmail: 'EmAiL@ExAmPlE.com'),
+        eg.user(fullName: 'z', deliveryEmail: 'ēmail@example.com'),
+      ];
+
+      checkAllSameRank('email@example.com', users);
+      checkAllSameRank('email@e',           users);
+      checkAllSameRank('email@',            users);
+      checkAllSameRank('email',             users);
+      checkAllSameRank('ema',               users);
     });
 
     test('email match is by prefix only', () {
diff --git a/test/model/emoji_test.dart b/test/model/emoji_test.dart
@@ -558,8 +558,10 @@ void main() {
       check(matchOfName('blue dia', 'large_blue_diamond')).wordAligned;
     });
 
-    test('query is lower-cased', () {
+    test('case-insensitive', () {
       check(matchOfName('Smi', 'smile')).prefix;
+      check(matchOfName('smi', 'SMILE')).prefix;
+      check(matchOfName('SmI', 'sMiLe')).prefix;
     });
 
     test('query matches aliases same way as primary name', () {
@@ -577,6 +579,8 @@ void main() {
       check(matchOfNames('blue_dia', ['x', 'large_blue_diamond'])).wordAligned;
 
       check(matchOfNames('Smi', ['x', 'smile'])).prefix;
+      check(matchOfNames('smi', ['x', 'SMILE'])).prefix;
+      check(matchOfNames('SmI', ['x', 'sMiLe'])).prefix;
     });
 
     test('best match among name and aliases prevails', () {