Skip to content

Commit 3ac043c

Browse files
a-johnston and samsface committed
Add fuzzy string matching to quick open search
Co-authored-by: sam <[email protected]>
1 parent a308047 commit 3ac043c

File tree

14 files changed

+2094
-498
lines changed

14 files changed

+2094
-498
lines changed

core/string/fuzzy_search.cpp

Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
/**************************************************************************/
2+
/* fuzzy_search.cpp */
3+
/**************************************************************************/
4+
/* This file is part of: */
5+
/* GODOT ENGINE */
6+
/* https://godotengine.org */
7+
/**************************************************************************/
8+
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
9+
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
10+
/* */
11+
/* Permission is hereby granted, free of charge, to any person obtaining */
12+
/* a copy of this software and associated documentation files (the */
13+
/* "Software"), to deal in the Software without restriction, including */
14+
/* without limitation the rights to use, copy, modify, merge, publish, */
15+
/* distribute, sublicense, and/or sell copies of the Software, and to */
16+
/* permit persons to whom the Software is furnished to do so, subject to */
17+
/* the following conditions: */
18+
/* */
19+
/* The above copyright notice and this permission notice shall be */
20+
/* included in all copies or substantial portions of the Software. */
21+
/* */
22+
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
23+
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
24+
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
25+
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
26+
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
27+
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
28+
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
29+
/**************************************************************************/
30+
31+
#include "fuzzy_search.h"
32+
33+
// Interpolation weight passed to Math::lerp() between the average and the maximum result score
// when FuzzySearch::sort_and_filter() computes its cull threshold.
constexpr float cull_factor = 0.1f;
34+
// Upper bound applied to the cull threshold computed in FuzzySearch::sort_and_filter().
constexpr float cull_cutoff = 30.0f;
35+
// Characters that _is_word_boundary() treats as word separators inside a target string.
const String boundary_chars = "/\\-_.";
36+
37+
static bool _is_valid_interval(const Vector2i &p_interval) {
38+
// Empty intervals are represented as (-1, -1).
39+
return p_interval.x >= 0 && p_interval.y >= p_interval.x;
40+
}
41+
42+
// Returns the smallest interval covering both `p_a` and `p_b`.
// If only one of the two is valid, that one is returned unchanged; if `p_a` is invalid,
// `p_b` is returned as-is (even when it is invalid too).
static Vector2i _extend_interval(const Vector2i &p_a, const Vector2i &p_b) {
	const bool a_valid = _is_valid_interval(p_a);
	const bool b_valid = _is_valid_interval(p_b);

	if (a_valid && b_valid) {
		const int lower = MIN(p_a.x, p_b.x);
		const int upper = MAX(p_a.y, p_b.y);
		return Vector2i(lower, upper);
	}

	return a_valid ? p_a : p_b;
}
51+
52+
static bool _is_word_boundary(const String &p_str, int p_index) {
53+
if (p_index == -1 || p_index == p_str.size()) {
54+
return true;
55+
}
56+
return boundary_chars.find_char(p_str[p_index]) != -1;
57+
}
58+
59+
// Looks for this token as a contiguous substring of `p_target`, starting the search at `p_offset`.
// `p_match` always receives the token index and token length; on success the matched substring
// is recorded on it as well. Returns whether the token was found.
bool FuzzySearchToken::try_exact_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset) const {
	const int token_len = string.length();
	p_match.token_idx = idx;
	p_match.token_length = token_len;

	const int found_at = p_target.find(string, p_offset);
	if (found_at != -1) {
		p_match.add_substring(found_at, token_len);
		return true;
	}
	return false;
}
69+
70+
// Greedily matches this token as a subsequence of `p_target`, starting at `p_offset`.
// Each token character is searched from the current offset; a character that cannot be found
// consumes one unit of `p_miss_budget` and does not advance the offset. Consecutive hits are
// merged into runs, and every run is recorded on `p_match` for later scoring and display.
// Returns false as soon as the miss budget drops below zero, true otherwise.
bool FuzzySearchToken::try_fuzzy_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset, int p_miss_budget) const {
	const int token_len = string.length();
	p_match.token_idx = idx;
	p_match.token_length = token_len;

	// Current run of adjacent matched characters; run_begin == -1 means "no run open".
	int run_begin = -1;
	int run_size = 0;

	for (int c = 0; c < token_len; c++) {
		const int found_at = p_target.find_char(string[c], p_offset);
		if (found_at < 0) {
			// Character missing from the rest of the target: spend budget, keep the offset.
			if (--p_miss_budget < 0) {
				return false;
			}
			continue;
		}

		const bool extends_run = run_begin != -1 && found_at == p_offset;
		if (extends_run) {
			run_size++;
		} else {
			// A gap was skipped (or this is the first hit): flush the previous run, open a new one.
			if (run_begin != -1) {
				p_match.add_substring(run_begin, run_size);
			}
			run_begin = found_at;
			run_size = 1;
		}
		p_offset = found_at + 1;
	}

	// Flush the trailing run, if any character matched at all.
	if (run_begin != -1) {
		p_match.add_substring(run_begin, run_size);
	}

	return true;
}
105+
106+
// Records one matched run, stored as (start, length), adds its length to `matched_length`
// and grows `interval` so that it covers the run's first and last character index.
void FuzzyTokenMatch::add_substring(int p_substring_start, int p_substring_length) {
	const int last_index = p_substring_start + p_substring_length - 1;
	interval = _extend_interval(interval, Vector2i(p_substring_start, last_index));
	matched_length += p_substring_length;
	substrings.append(Vector2i(p_substring_start, p_substring_length));
}
112+
113+
bool FuzzyTokenMatch::intersects(const Vector2i &p_other_interval) const {
114+
if (!_is_valid_interval(interval) || !_is_valid_interval(p_other_interval)) {
115+
return false;
116+
}
117+
return interval.y >= p_other_interval.x && interval.x <= p_other_interval.y;
118+
}
119+
120+
bool FuzzySearchResult::can_add_token_match(const FuzzyTokenMatch &p_match) const {
121+
if (p_match.get_miss_count() > miss_budget) {
122+
return false;
123+
}
124+
125+
if (p_match.intersects(match_interval)) {
126+
if (token_matches.size() == 1) {
127+
return false;
128+
}
129+
for (const FuzzyTokenMatch &existing_match : token_matches) {
130+
if (existing_match.intersects(p_match.interval)) {
131+
return false;
132+
}
133+
}
134+
}
135+
136+
return true;
137+
}
138+
139+
bool FuzzyTokenMatch::is_case_insensitive(const String &p_original, const String &p_adjusted) const {
140+
for (const Vector2i &substr : substrings) {
141+
const int end = substr.x + substr.y;
142+
for (int i = substr.x; i < end; i++) {
143+
if (p_original[i] != p_adjusted[i]) {
144+
return true;
145+
}
146+
}
147+
}
148+
return false;
149+
}
150+
151+
// Computes and stores `p_match.score`.
// The weights can always be tuned further. The intent is that exact matches almost always win
// over broken-up matches, with the remaining criteria acting more or less as tie breakers.
void FuzzySearchResult::score_token_match(FuzzyTokenMatch &p_match, bool p_case_insensitive) const {
	// Base penalty: 20 per missed character, plus 3 when the match is flagged case-insensitive.
	p_match.score = -20 * p_match.get_miss_count();
	if (p_case_insensitive) {
		p_match.score -= 3;
	}

	for (const Vector2i &run : p_match.substrings) {
		const int run_start = run.x;
		const int run_length = run.y;

		// Longer runs score quadratically higher than short ones.
		int run_score = run_length * run_length;

		// Runs located after the last '/' (deeper in the path) count double.
		if (run_start > dir_index) {
			run_score *= 2;
		}

		// Small bonus when the run starts or ends on a word boundary.
		const bool starts_on_boundary = _is_word_boundary(target, run_start - 1);
		if (starts_on_boundary || _is_word_boundary(target, run_start + run_length)) {
			run_score += 4;
		}

		// Large bonus when a single run covers the entire token.
		if (run_length == p_match.token_length) {
			run_score += 100;
		}

		p_match.score += run_score;
	}
}
175+
176+
void FuzzySearchResult::maybe_apply_score_bonus() {
177+
// This adds a small bonus to results which match tokens in the same order they appear in the query.
178+
int *token_range_starts = (int *)alloca(sizeof(int) * token_matches.size());
179+
180+
for (const FuzzyTokenMatch &match : token_matches) {
181+
token_range_starts[match.token_idx] = match.interval.x;
182+
}
183+
184+
int last = token_range_starts[0];
185+
for (int i = 1; i < token_matches.size(); i++) {
186+
if (last > token_range_starts[i]) {
187+
return;
188+
}
189+
last = token_range_starts[i];
190+
}
191+
192+
score += 1;
193+
}
194+
195+
// Accepts `p_match` into this result: spends its misses from the budget, widens the combined
// match interval, adds its score to the total and stores a copy of the match.
void FuzzySearchResult::add_token_match(const FuzzyTokenMatch &p_match) {
	miss_budget -= p_match.get_miss_count();
	match_interval = _extend_interval(match_interval, p_match.interval);
	score += p_match.score;
	token_matches.append(p_match);
}
201+
202+
// Removes, in place, every result whose score is below `p_cull_score`.
// Kept elements from the back are moved into the slots of removed elements at the front,
// so the relative order of the surviving results is not preserved.
void remove_low_scores(Vector<FuzzySearchResult> &p_results, float p_cull_score) {
	int front = 0;
	int back = p_results.size() - 1;
	FuzzySearchResult *items = p_results.ptrw();

	for (;;) {
		// Move `back` down to the last element that must be kept.
		while (back >= front && items[back].score < p_cull_score) {
			back--;
		}
		// Move `front` up to the first element that must be removed.
		while (front < back && items[front].score >= p_cull_score) {
			front++;
		}
		if (front >= back) {
			break;
		}
		// Overwrite the removed element with the kept one and continue inward.
		items[front] = items[back];
		front++;
		back--;
	}

	// Everything up to and including `back` is kept.
	p_results.resize(back + 1);
}
224+
225+
// Culls low-scoring results, then orders the remainder best-first and truncates the list to at
// most `max_results` entries.
void FuzzySearch::sort_and_filter(Vector<FuzzySearchResult> &p_results) const {
	if (p_results.is_empty()) {
		return;
	}

	// Orders by (score descending, target length ascending, target ascending) so that the
	// resulting order is consistent.
	struct FuzzySearchResultComparator {
		bool operator()(const FuzzySearchResult &p_lhs, const FuzzySearchResult &p_rhs) const {
			if (p_lhs.score != p_rhs.score) {
				return p_lhs.score > p_rhs.score;
			}
			if (p_lhs.target.length() != p_rhs.target.length()) {
				return p_lhs.target.length() < p_rhs.target.length();
			}
			return p_lhs.target < p_rhs.target;
		}
	};

	// NOTE(review): max_score starts at 0, so it never drops below 0 even when every result
	// has a negative score.
	float score_sum = 0;
	float max_score = 0;

	for (const FuzzySearchResult &result : p_results) {
		score_sum += result.score;
		max_score = MAX(max_score, result.score);
	}

	// TODO: Tune scoring and culling here to display fewer subsequence soup matches when good matches
	// are available.
	float avg_score = score_sum / p_results.size();
	float cull_score = MIN(cull_cutoff, Math::lerp(avg_score, max_score, cull_factor));
	remove_low_scores(p_results, cull_score);

	SortArray<FuzzySearchResult, FuzzySearchResultComparator> sorter;

	if (p_results.size() <= max_results) {
		sorter.sort(p_results.ptrw(), p_results.size());
		return;
	}

	// More results than requested: only the best `max_results` need to be ordered.
	sorter.partial_sort(0, p_results.size(), max_results, p_results.ptrw());
	p_results.resize(max_results);
}
266+
267+
void FuzzySearch::set_query(const String &p_query) {
268+
tokens.clear();
269+
for (const String &string : p_query.split(" ", false)) {
270+
tokens.append({ static_cast<int>(tokens.size()), string });
271+
}
272+
273+
case_sensitive = !p_query.is_lowercase();
274+
275+
struct TokenComparator {
276+
bool operator()(const FuzzySearchToken &A, const FuzzySearchToken &B) const {
277+
if (A.string.length() == B.string.length()) {
278+
return A.idx < B.idx;
279+
}
280+
return A.string.length() > B.string.length();
281+
}
282+
};
283+
284+
// Prioritize matching longer tokens before shorter ones since match overlaps are not accepted.
285+
tokens.sort_custom<TokenComparator>();
286+
}
287+
288+
bool FuzzySearch::search(const String &p_target, FuzzySearchResult &p_result) const {
289+
p_result.target = p_target;
290+
p_result.dir_index = p_target.rfind_char('/');
291+
p_result.miss_budget = max_misses;
292+
293+
String adjusted_target = case_sensitive ? p_target : p_target.to_lower();
294+
295+
// For each token, eagerly generate subsequences starting from index 0 and keep the best scoring one
296+
// which does not conflict with prior token matches. This is not ensured to find the highest scoring
297+
// combination of matches, or necessarily the highest scoring single subsequence, as it only considers
298+
// eager subsequences for a given index, and likewise eagerly finds matches for each token in sequence.
299+
for (const FuzzySearchToken &token : tokens) {
300+
FuzzyTokenMatch best_match;
301+
int offset = start_offset;
302+
303+
while (true) {
304+
FuzzyTokenMatch match;
305+
if (allow_subsequences) {
306+
if (!token.try_fuzzy_match(match, adjusted_target, offset, p_result.miss_budget)) {
307+
break;
308+
}
309+
} else {
310+
if (!token.try_exact_match(match, adjusted_target, offset)) {
311+
break;
312+
}
313+
}
314+
if (p_result.can_add_token_match(match)) {
315+
p_result.score_token_match(match, match.is_case_insensitive(p_target, adjusted_target));
316+
if (best_match.token_idx == -1 || best_match.score < match.score) {
317+
best_match = match;
318+
}
319+
}
320+
if (_is_valid_interval(match.interval)) {
321+
offset = match.interval.x + 1;
322+
} else {
323+
break;
324+
}
325+
}
326+
327+
if (best_match.token_idx == -1) {
328+
return false;
329+
}
330+
331+
p_result.add_token_match(best_match);
332+
}
333+
334+
p_result.maybe_apply_score_bonus();
335+
return true;
336+
}
337+
338+
void FuzzySearch::search_all(const PackedStringArray &p_targets, Vector<FuzzySearchResult> &p_results) const {
339+
p_results.clear();
340+
341+
for (const String &target : p_targets) {
342+
FuzzySearchResult result;
343+
if (search(target, result)) {
344+
p_results.append(result);
345+
}
346+
}
347+
348+
sort_and_filter(p_results);
349+
}

0 commit comments

Comments
 (0)