Skip to content

Commit bb324c7

Browse files
authored
ui: Improve fuzzy matching on camelcase strings by splitting them into tokens (#5054)
Fuzzy matching currently doesn't work well for CamelCase strings: for example, searching for 'case' does not match 'CamelCase', because MiniSearch works on tokens but only splits on spaces and punctuation (e.g. underscores). This PR adds a new tokenizer that also splits at CamelCase boundaries, e.g. 'CamelCase' into 'Camel' and 'Case'.
1 parent c812dd6 commit bb324c7

File tree

2 files changed

+52
-0
lines changed

2 files changed

+52
-0
lines changed

ui/src/base/fuzzy.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ export class FuzzyFinder<T> {
4242
const docs = items.map((item, i) => ({id: i, text: keyLookup(item)}));
4343
this.miniSearch = new MiniSearch({
4444
fields: ['text'],
45+
tokenize: camelCaseTokenize,
4546
searchOptions: {
47+
tokenize: camelCaseTokenize,
4648
// Allow 1 edit for short terms, ~20% for longer ones.
4749
fuzzy: (term: string) =>
4850
term.length <= 3 ? 1 : Math.ceil(term.length * 0.2),
@@ -73,6 +75,24 @@ export class FuzzyFinder<T> {
7375
}
7476
}
7577

78+
// Tokenize text by splitting on whitespace/punctuation AND camelCase boundaries.
// E.g. "dev.perfetto.LiveMemory" -> ["dev", "perfetto", "Live", "Memory"]
// This allows searching for "memory" to match "LiveMemory".
//
// Characters are classified with Unicode properties rather than ASCII ranges,
// so non-ASCII letters and digits (e.g. "café", "日本語") are kept inside
// tokens instead of being treated as separators and silently dropped.
//
// Returns the tokens in order of appearance, with their original casing.
// Empty input, or input made only of separators, yields an empty array.
function camelCaseTokenize(text: string): string[] {
  // A separator is any run of characters that are not letters, combining
  // marks or numbers (dots, spaces, underscores, etc.). Marks are kept so
  // that decomposed accented letters are not split apart.
  const separator = /[^\p{L}\p{M}\p{N}]+/u;
  // camelCase boundary: before an uppercase letter preceded by a lowercase
  // letter, or before an uppercase letter followed by a lowercase letter when
  // preceded by uppercase (e.g. "XMLParser" -> ["XML", "Parser"]).
  const camelBoundary = /(?<=\p{Ll})(?=\p{Lu})|(?<=\p{Lu})(?=\p{Lu}\p{Ll})/u;

  const tokens: string[] = [];
  for (const coarseToken of text.split(separator)) {
    // split() produces empty strings for leading/trailing separators.
    if (coarseToken === '') continue;
    tokens.push(...coarseToken.split(camelBoundary));
  }
  return tokens;
}
95+
7696
// Given a query (possibly multi-word) and candidate text, compute highlight
7797
// segments. Each query token is first tried as a substring match, then falls
7898
// back to sequential character matching.

ui/src/base/fuzzy_unittest.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,38 @@ describe('FuzzyFinder', () => {
8383
});
8484
});
8585

86+
// Checks that FuzzyFinder matches individual words inside dotted, camelCase
// identifiers, including a word that follows an uppercase acronym.
describe('FuzzyFinder camelCase tokenization', () => {
  const finder = new FuzzyFinder(
    [
      'dev.perfetto.LiveMemory',
      'dev.perfetto.RecordTraceV2',
      'com.android.XMLParser',
    ],
    (x) => x,
  );

  // Matcher for "the result list contains an entry whose item is `item`".
  const containsItem = (item: string) =>
    expect.arrayContaining([expect.objectContaining({item})]);

  it('finds camelCase sub-word', () => {
    // "memory" is the second word of "LiveMemory".
    expect(finder.find('memory')).toEqual(
      containsItem('dev.perfetto.LiveMemory'),
    );
  });

  it('finds dotted segment', () => {
    // Two of the three items contain the "perfetto" segment.
    const matches = finder.find('perfetto');
    expect(matches.length).toBeGreaterThanOrEqual(2);
  });

  it('finds uppercase acronym split', () => {
    // "parser" follows the acronym "XML" in "XMLParser".
    expect(finder.find('parser')).toEqual(
      containsItem('com.android.XMLParser'),
    );
  });
});
117+
86118
test('fuzzyMatch', () => {
87119
expect(fuzzyMatch('foo bar baz', 'foo')).toEqual({
88120
matches: true,

0 commit comments

Comments
 (0)