Skip to content

Commit 1af7a25

Browse files
committed
autocomplete: Match insensitively to diacritics
Fixes zulip#237. Fixes zulip#1067.
1 parent b342ec3 commit 1af7a25

File tree

4 files changed

+97
-46
lines changed

4 files changed

+97
-46
lines changed

lib/model/autocomplete.dart

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import 'dart:math';
22

33
import 'package:flutter/foundation.dart';
44
import 'package:flutter/services.dart';
5+
import 'package:unorm_dart/unorm_dart.dart' as unorm;
56

67
import '../api/model/events.dart';
78
import '../api/model/model.dart';
@@ -732,12 +733,12 @@ class MentionAutocompleteView extends AutocompleteView<MentionAutocompleteQuery,
732733
/// An [AutocompleteQuery] object stores the user's actual query string
733734
/// as [raw].
734735
/// It may also store processed forms of the query
735-
/// (for example, converted to lowercase or split on whitespace)
736+
/// (for example, normalized by case and diacritics, or split on whitespace)
736737
/// to prepare for whatever particular form of searching will be done
737738
/// for the given type of autocomplete interaction.
738739
abstract class AutocompleteQuery {
739740
AutocompleteQuery(this.raw) {
740-
_normalized = raw.toLowerCase();
741+
_normalized = lowercaseAndStripDiacritics(raw);
741742
// TODO(#1805) split on space characters that the user is actually using
742743
// (e.g. U+3000 IDEOGRAPHIC SPACE);
743744
// could check active keyboard or just split on all kinds of spaces
@@ -751,15 +752,23 @@ abstract class AutocompleteQuery {
751752

752753
late final List<String> _normalizedWords;
753754

755+
static final RegExp _regExpStripMarkCharacters = RegExp(r'\p{M}', unicode: true);
756+
757+
static String lowercaseAndStripDiacritics(String input) {
758+
// Anders reports that this is what web does; see discussion:
759+
// https://chat.zulip.org/#narrow/channel/48-mobile/topic/deps.3A.20Add.20new.20package.20to.20handle.20diacritics/near/2244487
760+
final lowercase = input.toLowerCase();
761+
final compatibilityNormalized = unorm.nfkd(lowercase);
762+
return compatibilityNormalized.replaceAll(_regExpStripMarkCharacters, '');
763+
}
764+
754765
/// Whether all of this query's words have matches in [words],
755-
/// modulo case, that appear in order.
766+
/// insensitively to case and diacritics, that appear in order.
756767
///
757768
/// A "match" means the word in [words] starts with the query word.
758769
///
759-
/// [words] must all be lowercased.
770+
/// [words] must all have been passed through [lowercaseAndStripDiacritics].
760771
bool _testContainsQueryWords(List<String> words) {
761-
// TODO(#237) test with diacritics stripped, where appropriate,
762-
// and update dartdoc's summary line and its restriction about [words].
763772
int wordsIndex = 0;
764773
int queryWordsIndex = 0;
765774
while (true) {
@@ -828,10 +837,9 @@ class MentionAutocompleteQuery extends ComposeAutocompleteQuery {
828837

829838
WildcardMentionAutocompleteResult? testWildcardOption(WildcardMentionOption wildcardOption, {
830839
required ZulipLocalizations localizations}) {
831-
// TODO(#237): match insensitively to diacritics
832840
final localized = wildcardOption.localizedCanonicalString(localizations);
833841
final matches = wildcardOption.canonicalString.contains(_normalized)
834-
|| localized.toLowerCase().contains(_normalized);
842+
|| AutocompleteQuery.lowercaseAndStripDiacritics(localized).contains(_normalized);
835843
if (!matches) return null;
836844
return WildcardMentionAutocompleteResult(
837845
wildcardOption: wildcardOption, rank: _rankWildcardResult);
@@ -982,7 +990,8 @@ class AutocompleteDataCache {
982990

983991
/// The normalized `fullName` of [user].
984992
String normalizedNameForUser(User user) {
985-
return _normalizedNamesByUser[user.userId] ??= user.fullName.toLowerCase();
993+
return _normalizedNamesByUser[user.userId]
994+
??= AutocompleteQuery.lowercaseAndStripDiacritics(user.fullName);
986995
}
987996

988997
final Map<int, List<String>> _normalizedNameWordsByUser = {};
@@ -996,14 +1005,18 @@ class AutocompleteDataCache {
9961005

9971006
/// The normalized `deliveryEmail` of [user], or null if that's null.
9981007
String? normalizedEmailForUser(User user) {
999-
return _normalizedEmailsByUser[user.userId] ??= user.deliveryEmail?.toLowerCase();
1008+
return _normalizedEmailsByUser[user.userId]
1009+
??= (user.deliveryEmail != null
1010+
? AutocompleteQuery.lowercaseAndStripDiacritics(user.deliveryEmail!)
1011+
: null);
10001012
}
10011013

10021014
final Map<int, String> _normalizedNamesByUserGroup = {};
10031015

10041016
/// The normalized `name` of [userGroup].
10051017
String normalizedNameForUserGroup(UserGroup userGroup) {
1006-
return _normalizedNamesByUserGroup[userGroup.id] ??= userGroup.name.toLowerCase();
1018+
return _normalizedNamesByUserGroup[userGroup.id]
1019+
??= AutocompleteQuery.lowercaseAndStripDiacritics(userGroup.name);
10071020
}
10081021

10091022
final Map<int, List<String>> _normalizedNameWordsByUserGroup = {};
@@ -1203,11 +1216,11 @@ class TopicAutocompleteQuery extends AutocompleteQuery {
12031216
// TODO(#881): Sort by match relevance, like web does.
12041217

12051218
if (topic.displayName == null) {
1206-
return store.realmEmptyTopicDisplayName.toLowerCase()
1219+
return AutocompleteQuery.lowercaseAndStripDiacritics(store.realmEmptyTopicDisplayName)
12071220
.contains(_normalized);
12081221
}
12091222
return topic.displayName != raw
1210-
&& topic.displayName!.toLowerCase().contains(_normalized);
1223+
&& AutocompleteQuery.lowercaseAndStripDiacritics(topic.displayName!).contains(_normalized);
12111224
}
12121225

12131226
@override

lib/model/emoji.dart

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,21 +74,21 @@ final class EmojiCandidate {
7474
/// This might not be the only name this emoji has; see [aliases].
7575
final String emojiName;
7676

77-
/// [emojiName], but lowercased to support fuzzy matching.
78-
// TODO(#1067) also remove diacritics
77+
/// [emojiName], but via [AutocompleteQuery.lowercaseAndStripDiacritics]
78+
/// to support fuzzy matching.
7979
String get normalizedEmojiName => _normalizedEmojiName
80-
??= emojiName.toLowerCase();
80+
??= AutocompleteQuery.lowercaseAndStripDiacritics(emojiName);
8181
String? _normalizedEmojiName;
8282

8383
/// Additional Zulip "emoji name" values for this emoji,
8484
/// to show in the emoji picker UI.
8585
Iterable<String> get aliases => _aliases ?? const [];
8686
final List<String>? _aliases;
8787

88-
/// [aliases], but lowercased to support fuzzy matching.
89-
// TODO(#1067) also remove diacritics
88+
/// [aliases], but via [AutocompleteQuery.lowercaseAndStripDiacritics]
89+
/// to support fuzzy matching.
9090
Iterable<String> get normalizedAliases => _normalizedAliases
91-
??= aliases.map((alias) => alias.toLowerCase());
91+
??= aliases.map((alias) => AutocompleteQuery.lowercaseAndStripDiacritics(alias));
9292
Iterable<String>? _normalizedAliases;
9393

9494
final EmojiDisplay emojiDisplay;
@@ -513,9 +513,8 @@ class EmojiAutocompleteQuery extends ComposeAutocompleteQuery {
513513

514514
static const _separator = '_';
515515

516-
static String _adjustQuery(String raw) {
517-
return raw.toLowerCase().replaceAll(' ', '_'); // TODO(#1067) remove diacritics too
518-
}
516+
static String _adjustQuery(String raw) =>
517+
AutocompleteQuery.lowercaseAndStripDiacritics(raw.replaceAll(' ', '_'));
519518

520519
@override
521520
EmojiAutocompleteView initViewModel({

test/model/autocomplete_test.dart

Lines changed: 58 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -932,9 +932,11 @@ void main() {
932932
}
933933

934934
for (final option in WildcardMentionOption.values) {
935-
// These are hard-coded, and they happened to be lowercase when written.
935+
// These are hard-coded, and they happened to be lowercase and without
936+
// diacritics when written.
936937
// Throw if that changes, to not accidentally break fuzzy matching.
937-
check(option.canonicalString).equals(option.canonicalString.toLowerCase());
938+
check(option.canonicalString).equals(
939+
AutocompleteQuery.lowercaseAndStripDiacritics(option.canonicalString));
938940
}
939941

940942
final testArabic = wildcardTesterForLocale((locale) => locale.languageCode == 'ar');
@@ -956,6 +958,10 @@ void main() {
956958
testGerman('Thema', topicNarrow, [WildcardMentionOption.topic]);
957959
testGerman('thema', topicNarrow, [WildcardMentionOption.topic]);
958960

961+
final testPolish = wildcardTesterForLocale((locale) => locale.languageCode == 'pl');
962+
testPolish('wątek', topicNarrow, [WildcardMentionOption.topic]);
963+
testPolish('watek', topicNarrow, [WildcardMentionOption.topic]);
964+
959965
test('no wildcards for a silent mention', () {
960966
check(getWildcardOptionsFor('', isSilent: true, narrow: channelNarrow))
961967
.isEmpty();
@@ -1075,6 +1081,14 @@ void main() {
10751081
check(rankOf(query, a)!).equals(rankOf(query, b)!);
10761082
}
10771083

1084+
void checkAllSameRank(String query, Iterable<Object> candidates) {
1085+
// (i.e. throw here if it's not a match)
1086+
final firstCandidateRank = rankOf(query, candidates.first)!;
1087+
1088+
final ranks = candidates.skip(1).map((candidate) => rankOf(query, candidate));
1089+
check(ranks).every((it) => it.equals(firstCandidateRank));
1090+
}
1091+
10781092
test('wildcards, then users', () {
10791093
checkSameRank('', WildcardMentionOption.all, WildcardMentionOption.topic);
10801094
checkPrecedes('', WildcardMentionOption.topic, eg.user());
@@ -1087,13 +1101,28 @@ void main() {
10871101
checkPrecedes(user.fullName, WildcardMentionOption.channel, user);
10881102
});
10891103

1090-
test('user name matched case-insensitively', () {
1091-
final user1 = eg.user(fullName: 'Chris Bobbe');
1092-
final user2 = eg.user(fullName: 'chris bobbe');
1104+
test('user name match is case- and diacritics-insensitive', () {
1105+
final users = [
1106+
eg.user(fullName: 'Édith Piaf'),
1107+
eg.user(fullName: 'édith piaf'),
1108+
eg.user(fullName: 'Edith Piaf'),
1109+
eg.user(fullName: 'edith piaf'),
1110+
];
1111+
1112+
checkAllSameRank('Édith Piaf', users); // exact
1113+
checkAllSameRank('Edith Piaf', users); // exact
1114+
checkAllSameRank('édith piaf', users); // exact
1115+
checkAllSameRank('edith piaf', users); // exact
10931116

1094-
checkSameRank('chris bobbe', user1, user2); // exact
1095-
checkSameRank('chris bo', user1, user2); // total-prefix
1096-
checkSameRank('chr bo', user1, user2); // word-prefixes
1117+
checkAllSameRank('Édith Pi', users); // total-prefix
1118+
checkAllSameRank('Edith Pi', users); // total-prefix
1119+
checkAllSameRank('édith pi', users); // total-prefix
1120+
checkAllSameRank('edith pi', users); // total-prefix
1121+
1122+
checkAllSameRank('Éd Pi', users); // word-prefixes
1123+
checkAllSameRank('Ed Pi', users); // word-prefixes
1124+
checkAllSameRank('éd pi', users); // word-prefixes
1125+
checkAllSameRank('ed pi', users); // word-prefixes
10971126
});
10981127

10991128
test('user name match: exact over total-prefix', () {
@@ -1110,13 +1139,16 @@ void main() {
11101139
checkPrecedes('so m', user1, user2);
11111140
});
11121141

1113-
test('group name matched case-insensitively', () {
1114-
final userGroup1 = eg.userGroup(name: 'Mobile Team');
1115-
final userGroup2 = eg.userGroup(name: 'mobile team');
1142+
test('group name is case- and diacritics-insensitive', () {
1143+
final userGroups = [
1144+
eg.userGroup(name: 'Mobile Team'),
1145+
eg.userGroup(name: 'mobile team'),
1146+
eg.userGroup(name: 'möbile team'),
1147+
];
11161148

1117-
checkSameRank('mobile team', userGroup1, userGroup2); // exact
1118-
checkSameRank('mobile te', userGroup1, userGroup2); // total-prefix
1119-
checkSameRank('mob te', userGroup1, userGroup2); // word-prefixes
1149+
checkAllSameRank('mobile team', userGroups); // exact
1150+
checkAllSameRank('mobile te', userGroups); // total-prefix
1151+
checkAllSameRank('mob te', userGroups); // word-prefixes
11201152
});
11211153

11221154
test('group name match: exact over total-prefix', () {
@@ -1133,16 +1165,19 @@ void main() {
11331165
checkPrecedes('so m', userGroup1, userGroup2);
11341166
});
11351167

1136-
test('email matched case-insensitively', () {
1168+
test('email match is case- and diacritics-insensitive', () {
11371169
// "z" name to prevent accidental name match with example data
1138-
final user1 = eg.user(fullName: 'z', deliveryEmail: '[email protected]');
1139-
final user2 = eg.user(fullName: 'z', deliveryEmail: '[email protected]');
1140-
1141-
checkSameRank('[email protected]', user1, user2);
1142-
checkSameRank('email@e', user1, user2);
1143-
checkSameRank('email@', user1, user2);
1144-
checkSameRank('email', user1, user2);
1145-
checkSameRank('ema', user1, user2);
1170+
final users = [
1171+
eg.user(fullName: 'z', deliveryEmail: '[email protected]'),
1172+
eg.user(fullName: 'z', deliveryEmail: '[email protected]'),
1173+
eg.user(fullName: 'z', deliveryEmail: '[email protected]'),
1174+
];
1175+
1176+
checkAllSameRank('[email protected]', users);
1177+
checkAllSameRank('email@e', users);
1178+
checkAllSameRank('email@', users);
1179+
checkAllSameRank('email', users);
1180+
checkAllSameRank('ema', users);
11461181
});
11471182

11481183
test('email match is by prefix only', () {

test/model/emoji_test.dart

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -558,8 +558,10 @@ void main() {
558558
check(matchOfName('blue dia', 'large_blue_diamond')).wordAligned;
559559
});
560560

561-
test('query is lower-cased', () {
561+
test('case-insensitive', () {
562562
check(matchOfName('Smi', 'smile')).prefix;
563+
check(matchOfName('smi', 'SMILE')).prefix;
564+
check(matchOfName('SmI', 'sMiLe')).prefix;
563565
});
564566

565567
test('query matches aliases same way as primary name', () {
@@ -577,6 +579,8 @@ void main() {
577579
check(matchOfNames('blue_dia', ['x', 'large_blue_diamond'])).wordAligned;
578580

579581
check(matchOfNames('Smi', ['x', 'smile'])).prefix;
582+
check(matchOfNames('smi', ['x', 'SMILE'])).prefix;
583+
check(matchOfNames('SmI', ['x', 'sMiLe'])).prefix;
580584
});
581585

582586
test('best match among name and aliases prevails', () {

0 commit comments

Comments
 (0)