Skip to content

Commit 8dc2576

Browse files
authored
Feat/improved ranking (#11)
## v2.0.0 Major update with improved search capabilities and updated configuration. ### Added - Substring search - Prefix search ### Changed - **BREAKING**: Adjusted indexing and query configuration. See also /src/default-config.ts for the new structure. Related to #2 and #6.
1 parent 8d2cda6 commit 8dc2576

File tree

105 files changed

+9425
-5246
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+9425
-5246
lines changed

.vscode/tasks.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
"isDefault": true
1212
},
1313
"label": "tsc: watch - tsconfig.json"
14+
},
15+
{
16+
"label": "Performance Test",
17+
"type": "shell",
18+
"command": "node dist/performance-test/main.js",
19+
"problemMatcher": [],
20+
"group": "test"
1421
}
1522
]
1623
}

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## [2.0.0] - 2025-10-28
9+
10+
### Added
11+
12+
- Substring search
13+
- Prefix search
14+
15+
### Changed
16+
17+
- **BREAKING**: Adjusted indexing and query configuration. See also [default-config.ts](./src/default-config.ts) for the new structure.

readme.md renamed to README.md

Lines changed: 92 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
# Frontend Fuzzy Search
1+
# Frontend Fuzzy + Substring + Prefix Search
22

33
@m31coding/fuzzy-search is a frontend library for searching objects with ids (entities) by their names and features (terms). It is
44

55
- Fast: A query usually takes well below 10 ms.
6-
- Accurate: Powered by n-grams with a novel approach of character sorting.
6+
- Accurate: Powered by a suffix array and n-grams with a novel approach of character sorting.
77
- Multilingual: The language-agnostic design of the algorithm enables operation across all languages.
88
- Flexible: Entities and their terms can be inserted, updated and removed.
99
- Reliable: Well tested standalone library with no dependencies.
10+
- Universal: Works seamlessly in both frontend and backend (Node.js) environments.
1011

1112
[![license](https://img.shields.io/badge/license-MIT-brightgreen)](https://github.com/m31coding/fuzzy-search/blob/master/LICENSE)
1213
[![npm version](https://img.shields.io/npm/v/%40m31coding%2Ffuzzy-search)](https://www.npmjs.com/package/@m31coding/fuzzy-search)
@@ -57,24 +58,26 @@ const persons = [
5758
{ id: 11923, firstName: 'Charlie', lastName: 'Rook' }
5859
];
5960

61+
function log<T>(obj: T): void {
62+
console.log(JSON.stringify(obj, null, 2));
63+
}
64+
6065
const indexingMeta = searcher.indexEntities(
6166
persons,
6267
(e) => e.id,
6368
(e) => [e.firstName, e.lastName, `${e.firstName} ${e.lastName}`]
6469
);
65-
console.dir(indexingMeta);
70+
log(indexingMeta);
6671
/* {
6772
"entries": {
68-
"numberOfInvalidTerms": 0,
69-
"numberOfDistinctTerms": 12,
70-
"normalizationDuration": 0,
71-
"numberOfSurrogateCharacters": 0,
72-
"indexingDuration": 1
73+
"numberOfTerms": 12,
74+
"indexingDurationTotal": 1,
75+
...
7376
}
7477
} */
7578

7679
const result = searcher.getMatches(new fuzzySearch.Query('alice kign'));
77-
console.dir(result);
80+
log(result);
7881
/* {
7982
"matches": [
8083
{
@@ -90,17 +93,30 @@ console.dir(result);
9093
"query": {
9194
"string": "alice kign",
9295
"topN": 10,
93-
"minQuality": 0.3
96+
"searchers": [
97+
{
98+
"type": "fuzzy",
99+
"minQuality": 0.3
100+
},
101+
{
102+
"type": "substring",
103+
"minQuality": 0
104+
},
105+
{
106+
"type": "prefix",
107+
"minQuality": 0
108+
}
109+
]
94110
},
95111
"meta": {
96112
"entries": {
97-
"queryDuration": 0
113+
"queryDuration": 1
98114
}
99115
}
100116
} */
101117

102118
const removalResult = searcher.removeEntities([99234, 5823]);
103-
console.dir(removalResult);
119+
log(removalResult);
104120
/* {
105121
"removedEntities": [
106122
99234,
@@ -125,19 +141,17 @@ const upsertMeta = searcher.upsertEntities(
125141
(e) => e.id,
126142
(e) => [e.firstName, e.lastName, `${e.firstName} ${e.lastName}`]
127143
);
128-
console.dir(upsertMeta);
144+
log(upsertMeta);
129145
/* {
130146
"entries": {
131-
"numberOfInvalidTerms": 0,
132-
"numberOfDistinctTerms": 12,
133-
"normalizationDuration": 0,
134-
"numberOfSurrogateCharacters": 0,
135-
"upsertDuration": 0
147+
"numberOfTerms": 12,
148+
"upsertDuration": 0,
149+
...
136150
}
137151
} */
138152

139153
const result2 = searcher.getMatches(new fuzzySearch.Query('allie'));
140-
console.dir(result2);
154+
log(result2);
141155
/* {
142156
"matches": [
143157
{
@@ -146,14 +160,27 @@ console.dir(result2);
146160
"firstName": "Allie",
147161
"lastName": "King"
148162
},
149-
"quality": 1,
163+
"quality": 3,
150164
"matchedString": "Allie"
151165
}
152166
],
153167
"query": {
154168
"string": "allie",
155169
"topN": 10,
156-
"minQuality": 0.3
170+
"searchers": [
171+
{
172+
"type": "fuzzy",
173+
"minQuality": 0.3
174+
},
175+
{
176+
"type": "substring",
177+
"minQuality": 0
178+
},
179+
{
180+
"type": "prefix",
181+
"minQuality": 0
182+
}
183+
]
157184
},
158185
"meta": {
159186
"entries": {
@@ -169,7 +196,9 @@ The following parameters are available when creating a query:
169196
| --------- | ---- | ------- | ----------- |
170197
| string | string | - | The query string. |
171198
| topN | number | 10 | The maximum number of matches to return. Provide Infinity to return all matches. |
172-
| minQuality | number | 0.3 | The minimum quality of a match, ranging from 0 to 1. When set to zero, all terms that share at least one common n-gram with the query are considered a match. |
199+
| searchers | SearcherSpec[] | [new FuzzySearcher(0.3), new SubstringSearcher(0), new PrefixSearcher(0)] | The searchers to use and the minimum quality thresholds for their matches. |
200+
201+
A fuzzy search minimum quality threshold below 0.3 is not recommended, as the respective matches are most likely irrelevant.
173202

174203
If the data terms contain characters and strings in non-latin scripts (such as Arabic, Cyrillic, Greek, Han, ... see also [ISO 15924](https://en.wikipedia.org/wiki/ISO_15924)), the default configuration must be adjusted before creating the searcher:
175204

@@ -218,33 +247,29 @@ Query strings and data terms are normalized in the following normalization pipel
218247
- Strings are normalized to NFKD.
219248
- Space equivalent characters are replaced by a space.
220249
- Surrogate characters, padding characters and other non-allowed characters are removed.
221-
- Strings are padded to the left, right and in the middle (replacement of spaces).
222250

223251
>Normalization to NFKC decomposes characters by compatibility, then re-composes them by canonical equivalence. This ensures that the characters in the replacement table always match. Normalization to NFKD decomposes the characters by compatibility but does not re-compose them, allowing undesired characters to be removed thereafter.
224252
225253
The default normalizer config adopts the following values:
226254

227255
```js
228-
let paddingLeft = '$$';
229-
let paddingRight = '!';
230-
let paddingMiddle = '!$$';
231-
let replacements = [fuzzySearch.LatinReplacements.Value];
256+
config.normalizerConfig.replacements = [fuzzySearch.LatinReplacements.Value];
232257
let spaceEquivalentCharacters = new Set(['_', '-', '', '/', ',', '\t']);
233-
let treatCharacterAsSpace = (c) => spaceEquivalentCharacters.has(c);
234-
let allowCharacter = (c) => {
258+
config.normalizerConfig.treatCharacterAsSpace = (c) => spaceEquivalentCharacters.has(c);
259+
config.normalizerConfig.allowCharacter = (c) => {
235260
return fuzzySearch.StringUtilities.isAlphanumeric(c);
236261
};
237262
```
238263

239-
With this pipeline and configuration, the string `Thanh Việt Đoàn` is normalized to `thanh viet doan` before padding. With padding applied, it becomes `$$thanh!$$viet!$$doan!`. The choice of the padding is explained in the next section.
264+
With this pipeline and configuration, the string `Thanh Việt Đoàn` is normalized to `thanh viet doan`.
240265

241-
## Sorted n-grams
266+
## Fuzzy search: sorted n-grams
242267

243-
The general idea of n-grams and the sorting trick is outlined in this [blog post](https://www.m31coding.com/blog/fuzzy-search.html). In short, the data terms and the query string are broken down into 3-grams, e.g. the string `$$sarah!` becomes:
268+
The general idea of n-grams and the sorting trick is outlined in this [blog post](https://www.m31coding.com/blog/fuzzy-search.html). In short, the data terms and the query string are padded on the left, on the right, and in the middle (where spaces are replaced) with `$$`, `!`, and `!$$`, respectively, before they are broken down into 3-grams. For example, the string `sarah` becomes `$$sarah!` after padding, and the resulting 3-grams are:
244269

245270
```text
246271
$$s, $sa, sar, ara, rah, ah!
247-
``````
272+
```
248273

249274
The more 3-grams the query and the term have in common, the higher the quality of the match. By padding the front with two characters, and the back with one character, more weight is given to the beginning of the string.
250275

@@ -269,18 +294,48 @@ The quality is then computed by dividing the number of common n-grams by the num
269294
270295
Padding strings in the middle allows for extending the algorithm across word boundaries. `sarah wolff` becomes `$$sarah!$$wolff!` and matches `wolff sarah` with a quality of 0.95, if 3-grams that end with a '\$' are discarded.
271296

272-
The overall approach outlined above can be summarized as: remove n-grams that end with '\$', sort n-grams that don't contain '\$'. The default configuration appears in the code as follows:
297+
The overall approach outlined above can be summarized as: remove n-grams that end with '\$', sort n-grams that don't contain '\$'. The default fuzzy search configuration appears in the code as follows:
273298

274299
```js
275-
let ngramN = 3;
276-
let transformNgram = (ngram) =>
300+
config.fuzzySearchConfig.paddingLeft = '$$';
301+
config.fuzzySearchConfig.paddingRight = '!';
302+
config.fuzzySearchConfig.paddingMiddle = '!$$';
303+
config.fuzzySearchConfig.ngramN = 3;
304+
config.fuzzySearchConfig.transformNgram = (ngram) =>
277305
ngram.endsWith('$') ? null
278306
: ngram.indexOf('$') === -1 ? ngram.split('').sort().join('')
279307
: ngram;
308+
config.fuzzySearchConfig.inequalityPenalty = 0.05;
309+
```
310+
311+
## Substring and prefix search
280312

281-
let inequalityPenalty = 0.05;
313+
Substring and prefix search are realized with a single suffix array, which is constructed with the algorithm described in [An efficient, versatile approach to suffix sorting](https://dl.acm.org/doi/10.1145/1227161.1278374).
314+
315+
The base quality of a prefix or substring match is simply computed by dividing the query length by the term length. For example, the query `sa` matches the term `sarah` with a quality of 2/5 = 0.4, and the query `ara` matches the same term with a quality of 3/5 = 0.6.
316+
317+
Quality offsets of +2 and +1 are added to prefix and substring matches, respectively, as explained in the next section.
318+
319+
The final qualities of the examples are:
320+
321+
| Query | Term | Searcher | Quality |
322+
| ----- | ----- | ----------| ----------|
323+
| sa | sarah | Prefix | 2 / 5 + 2 = 2.4 |
324+
| ara | sarah | Substring | 3 / 5 + 1 = 1.6 |
325+
326+
The default configuration for the searchers is as follows:
327+
328+
```js
329+
config.substringSearchConfig.suffixArraySeparator = '$';
282330
```
283331

332+
## Combining the searchers
333+
334+
The matches of the searchers are combined with a simple approach: prefix matches get a quality offset of +2, substring matches get an offset of +1, and fuzzy matches keep their original quality. Because the base quality of every searcher is at most 1, each prefix match ranks above each substring match, and each substring match ranks above each fuzzy match. The rationale is that, for the same query length, prefix matches are more relevant than substring matches. Additionally, fuzzy matches are only relevant if there are no prefix or substring matches.
335+
336+
## Changing the default configuration
337+
338+
The default configuration has been chosen carefully. There are only a few specific scenarios that require adjustments. Consult the file [default-config.ts](src/default-config.ts) for all configuration options and their default values.
284339

285340
## Support and Contribution
286341

demo/fuzzy-demo.js

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,13 @@ const dataPreview = document.getElementById('data-preview');
2323
const performanceTestRandomSeedInput = document.querySelector('#performance-test input[name="random-seed"]');
2424
const performanceTestNumberOfQueriesInput = document.querySelector('#performance-test input[name="number-of-queries"]');
2525
const performanceTestMaxMatchesInput = document.querySelector('#performance-test input[name="max-matches"]');
26-
const performanceTestMinQualityInput = document.querySelector('#performance-test input[name="min-quality"]');
26+
const performanceTestMinQualityFuzzyInput = document.querySelector('#performance-test input[name="min-quality-fuzzy"]');
27+
const performanceTestMinQualitySubstringInput = document.querySelector(
28+
'#performance-test input[name="min-quality-substring"]'
29+
);
30+
const performanceTestMinQualityPrefixInput = document.querySelector(
31+
'#performance-test input[name="min-quality-prefix"]'
32+
);
2733
document.getElementById('osm-data-card').addEventListener('click', downloadAndIndexOsmData);
2834
document
2935
.getElementById('person-data-card')
@@ -52,7 +58,9 @@ initializeParameterInput(personRandomSeedInput, parseIntInput);
5258
initializeParameterInput(performanceTestRandomSeedInput, parseIntInput);
5359
initializeParameterInput(performanceTestNumberOfQueriesInput, parsePositiveIntInput);
5460
initializeParameterInput(performanceTestMaxMatchesInput, parsePositiveIntInput);
55-
initializeParameterInput(performanceTestMinQualityInput, parsePositiveFloatInput);
61+
initializeParameterInput(performanceTestMinQualityFuzzyInput, parsePositiveFloatInput);
62+
initializeParameterInput(performanceTestMinQualitySubstringInput, parsePositiveFloatInput);
63+
initializeParameterInput(performanceTestMinQualityPrefixInput, parsePositiveFloatInput);
5664

5765
wireTableRows();
5866

@@ -206,11 +214,7 @@ async function downloadAndIndexOsmData() {
206214
return;
207215
}
208216

209-
const data = {
210-
entities: entities,
211-
kind: 'osm-places',
212-
latinOnly: false
213-
};
217+
const data = { entities: entities, kind: 'osm-places', latinOnly: false };
214218

215219
indexingRequest.data = data;
216220
indexingRequest.searchDataConfig = self.getSearchDataConfig(data.kind);
@@ -253,7 +257,7 @@ async function generateAndIndexPersonData(reindex = false) {
253257

254258
async function importFaker() {
255259
try {
256-
return await import('../node_modules/@faker-js/faker/dist/esm/index.mjs');
260+
return await import('../node_modules/@faker-js/faker/dist/index.js');
257261
} catch (e) {
258262
console.log(e);
259263
getIndexStatusTextElement().innerHTML += `Import error.`;
@@ -272,11 +276,7 @@ function generateAndIndexPersonDataPart2(indexingRequest, numberOfNames, randomS
272276

273277
const latinOnly = personData.scripts.size === 1 && personData.scripts.has('Latn');
274278

275-
const data = {
276-
entities: entities,
277-
kind: 'persons',
278-
latinOnly: latinOnly
279-
};
279+
const data = { entities: entities, kind: 'persons', latinOnly: latinOnly };
280280

281281
indexingRequest.data = data;
282282
indexingRequest.searchDataConfig = self.getSearchDataConfig(data.kind);
@@ -815,19 +815,28 @@ function runPerformanceTest() {
815815
const randomSeed = performanceTestRandomSeedInput.dataValue;
816816
const numberOfQueries = performanceTestNumberOfQueriesInput.dataValue;
817817
const maxMatches = performanceTestMaxMatchesInput.dataValue;
818-
const minQuality = performanceTestMinQualityInput.dataValue;
818+
const minQualityFuzzy = performanceTestMinQualityFuzzyInput.dataValue;
819+
const minQualitySubstring = performanceTestMinQualitySubstringInput.dataValue;
820+
const minQualityPrefix = performanceTestMinQualityPrefixInput.dataValue;
819821

820822
if (
821823
nullOrUndefined(randomSeed) ||
822824
nullOrUndefined(numberOfQueries) ||
823825
nullOrUndefined(maxMatches) ||
824-
nullOrUndefined(minQuality)
826+
nullOrUndefined(minQualityFuzzy) ||
827+
nullOrUndefined(minQualitySubstring) ||
828+
nullOrUndefined(minQualityPrefix)
825829
) {
826830
renderPerformanceTestResult();
827831
return;
828832
}
829833

830-
const testRunParameters = new fuzzySearch.TestRunParameters(randomSeed, numberOfQueries, maxMatches, minQuality);
834+
const searchers = [
835+
new fuzzySearch.FuzzySearcher(minQualityFuzzy),
836+
new fuzzySearch.SubstringSearcher(minQualitySubstring),
837+
new fuzzySearch.PrefixSearcher(minQualityPrefix)
838+
];
839+
const testRunParameters = new fuzzySearch.TestRunParameters(randomSeed, numberOfQueries, maxMatches, searchers);
831840
const performanceTest = new fuzzySearch.PerformanceTest(currentInstance.searcher);
832841
const report = performanceTest.run(testRunParameters);
833842
renderPerformanceTestResult(report);

demo/fuzzy-search-demo.html

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,16 @@ <h3>Add</h3>
195195
<input type="number" name="max-matches" value="10" />
196196
</label>
197197
<label>
198-
<span>Minimum quality <span class="parameter-error"></span></span>
199-
<input type="number" name="min-quality" value="0.3" />
198+
<span>Fuzzy searcher minimum quality <span class="parameter-error"></span></span>
199+
<input type="number" name="min-quality-fuzzy" value="0.3" />
200+
</label>
201+
<label>
202+
<span>Substring searcher minimum quality <span class="parameter-error"></span></span>
203+
<input type="number" name="min-quality-substring" value="0.0" />
204+
</label>
205+
<label>
206+
<span>Prefix searcher minimum quality <span class="parameter-error"></span></span>
207+
<input type="number" name="min-quality-prefix" value="0.0" />
200208
</label>
201209
</div>
202210
<button type="button" id="performance-test-button">run performance test</button>

0 commit comments

Comments
 (0)