
Commit f808909

Improving the tokenizer (#2744)
1 parent 1d8b68f commit f808909

3 files changed: 142 additions, 7 deletions


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 * Page category is removed from the search index and now everything is in section category. ([#2762], [#2413])
 * Changed the docstring block accordions from a custom implementation to HTML details+summary tag. ([#2772], [#2773])
+* Improved the search tokenizer and custom trimmer to improve search results. ([#1457], [#2114], [#2744])
 
 ### Fixed
 
@@ -2165,6 +2166,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 [#2726]: https://github.com/JuliaDocs/Documenter.jl/issues/2726
 [#2729]: https://github.com/JuliaDocs/Documenter.jl/issues/2729
 [#2737]: https://github.com/JuliaDocs/Documenter.jl/issues/2737
+[#2744]: https://github.com/JuliaDocs/Documenter.jl/issues/2744
 [#2748]: https://github.com/JuliaDocs/Documenter.jl/issues/2748
 [#2750]: https://github.com/JuliaDocs/Documenter.jl/issues/2750
 [#2751]: https://github.com/JuliaDocs/Documenter.jl/issues/2751

assets/html/js/search.js

Lines changed: 87 additions & 4 deletions
@@ -192,10 +192,10 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
     processTerm: (term) => {
       let word = stopWords.has(term) ? null : term;
       if (word) {
-        // custom trimmer that doesn't strip @ and !, which are used in julia macro and function names
+        // custom trimmer that doesn't strip special characters `@!+-*/^&|%<>=:.` which are used in julia macro and function names.
         word = word
-          .replace(/^[^a-zA-Z0-9@!]+/, "")
-          .replace(/[^a-zA-Z0-9@!]+$/, "");
+          .replace(/^[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+/, "")
+          .replace(/[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+$/, "");
 
         word = word.toLowerCase();
       }
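For reference, here is a minimal standalone sketch of the new trimmer behaviour. It is not part of the commit; the stopWords set below is a made-up placeholder for the real list in search.js, and the two replace calls are copied from the diff above.

const stopWords = new Set(["a", "an", "the"]); // placeholder subset, purely for this sketch

function trimTerm(term) {
  let word = stopWords.has(term) ? null : term;
  if (word) {
    // same two replaces as the new processTerm above
    word = word
      .replace(/^[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+/, "")
      .replace(/[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+$/, "");
    word = word.toLowerCase();
  }
  return word ?? null;
}

console.log(trimTerm("->"));       // "->"      (the old trimmer reduced this to an empty string)
console.log(trimTerm("`@time`"));  // "@time"   (backticks stripped, @ kept, as before)
console.log(trimTerm("Base.:+,")); // "base.:+" (trailing comma stripped, the operator survives)
console.log(trimTerm("the"));      // null      (stop words are still dropped)

The only behavioural change relative to the old trimmer is the larger set of edge characters that are preserved; stop-word filtering and lowercasing are untouched.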
@@ -204,7 +204,52 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
     },
     // add . as a separator, because otherwise "title": "Documenter.Anchors.add!", would not
     // find anything if searching for "add!", only for the entire qualification
-    tokenize: (string) => string.split(/[\s\-\.]+/),
+    tokenize: (string) => {
+      const tokens = [];
+      let remaining = string;
+
+      // julia specific patterns
+      const patterns = [
+        // Module qualified names (e.g., Base.sort, Module.Submodule.function)
+        /\b[A-Za-z0-9_]*(?:\.[A-Z][A-Za-z0-9_]*)*\.[a-z_][A-Za-z0-9_!]*\b/g,
+        // Macro calls (e.g., @time, @async)
+        /@[A-Za-z0-9_]*/g,
+        // Type parameters (e.g., Array{T,N}, Vector{Int})
+        /\b[A-Za-z0-9_]*\{[^}]+\}/g,
+        // Function names with module qualification (e.g., Base.+, Base.:^)
+        /\b[A-Za-z0-9_]*\.:[A-Za-z0-9_!+\-*/^&|%<>=.]+/g,
+        // Operators as complete tokens (e.g., !=, &&, ||, ^, .=, ->)
+        /[!<>=+\-*/^&|%:.]+/g,
+        // Function signatures with type annotations (e.g., f(x::Int))
+        /\b[A-Za-z0-9_!]*\([^)]*::[^)]*\)/g,
+        // Numbers (integers, floats, scientific notation)
+        /\b\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b/g,
+      ];
+
+      // apply patterns in order of specificity
+      for (const pattern of patterns) {
+        pattern.lastIndex = 0; // reset regex state
+        let match;
+        while ((match = pattern.exec(remaining)) != null) {
+          const token = match[0].trim();
+          if (token && !tokens.includes(token)) {
+            tokens.push(token);
+          }
+        }
+      }
+
+      // plain split on whitespace and common delimiters for whatever remains
+      const basicTokens = remaining
+        .split(/[\s\-,;()[\]{}]+/)
+        .filter((t) => t.trim());
+      for (const token of basicTokens) {
+        if (token && !tokens.includes(token)) {
+          tokens.push(token);
+        }
+      }
+
+      return tokens.filter((token) => token.length > 0);
+    },
     // options which will be applied during the search
     searchOptions: {
       prefix: true,
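To make the pattern list above concrete, here is a small illustration (not part of the commit) that applies the first three Julia-specific patterns to a made-up sample string. The regexes are copied from the diff; the sample text and printed results are only an example.

const patterns = [
  /\b[A-Za-z0-9_]*(?:\.[A-Z][A-Za-z0-9_]*)*\.[a-z_][A-Za-z0-9_!]*\b/g, // module-qualified names
  /@[A-Za-z0-9_]*/g,                                                   // macro calls
  /\b[A-Za-z0-9_]*\{[^}]+\}/g,                                         // type parameters
];

const sample = "@time Base.sort(v) returns a Vector{Int}";
for (const pattern of patterns) {
  console.log(sample.match(pattern));
}
// [ 'Base.sort' ]
// [ '@time' ]
// [ 'Vector{Int}' ]

Note that `remaining` is never shortened, so the plain split at the end still runs over the whole input: word-level tokens such as "returns", "Vector", and "Int" are indexed alongside the pattern matches, and the `tokens.includes` check only prevents exact duplicates.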
@@ -327,6 +372,35 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
     return result_div;
   }
 
+  function calculateCustomScore(result, query) {
+    const titleLower = result.title.toLowerCase();
+    const queryLower = query.toLowerCase();
+
+    // Tier 1: Exact title match
+    if (titleLower == queryLower) {
+      return 10000 + result.score;
+    }
+
+    // Tier 2: Title contains exact query
+    if (titleLower.includes(queryLower)) {
+      const position = titleLower.indexOf(queryLower);
+      // prefer matches at the beginning
+      return 5000 + result.score - position * 10;
+    }
+
+    // Tier 3: All query words in title
+    const queryWords = queryLower.trim().split(/\s+/);
+    const titleWords = titleLower.trim().split(/\s+/);
+    const allWordsInTitle = queryWords.every((qw) =>
+      titleWords.some((tw) => tw.includes(qw)),
+    );
+    if (allWordsInTitle) {
+      return 2000 + result.score;
+    }
+
+    return result.score;
+  }
+
   self.onmessage = function (e) {
     let query = e.data;
     let results = index.search(query, {
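As a quick sanity check on the tier boundaries, here is a hedged example (not part of the commit) that assumes calculateCustomScore from the hunk above is in scope and that each result came back from MiniSearch with an invented base score of 10.

const sample = [
  { title: "sort", score: 10 },      // Tier 1: exact title match        -> 10000 + 10       = 10010
  { title: "Base.sort", score: 10 }, // Tier 2: title contains the query -> 5000 + 10 - 5*10 = 4960
  { title: "Arrays", score: 10 },    // no tier applies                  -> plain score      = 10
];

sample
  .map((result) => ({ ...result, customScore: calculateCustomScore(result, "sort") }))
  .sort((a, b) => b.customScore - a.customScore)
  .forEach((r) => console.log(r.title, r.customScore));

Tier 3 only differs from Tier 2 for multi-word queries: for a query like "sort algorithm", a title such as "Sorting Algorithms" contains every query word inside some title word but not the exact phrase, so it lands at 2000 + score rather than 5000 + score.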
@@ -337,6 +411,15 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
       combineWith: "AND",
     });
 
+    // calculate custom scores for all results
+    results = results.map((result) => ({
+      ...result,
+      customScore: calculateCustomScore(result, query),
+    }));
+
+    // sort by custom score in descending order
+    results.sort((a, b) => b.customScore - a.customScore);
+
     // Pre-filter to deduplicate and limit to 200 per category to the extent
     // possible without knowing what the filters are.
     let filtered_results = [];

test/search/wrapper.js

Lines changed: 53 additions & 3 deletions
@@ -21,12 +21,62 @@ const index = new MiniSearch({
   processTerm: (term) => {
     let word = stopWords.has(term) ? null : term;
     if (word) {
-      word = word.replace(/^[^a-zA-Z0-9@!]+/, "").replace(/[^a-zA-Z0-9@!]+$/, "");
-      word = word.toLowerCase();
+      // custom trimmer that doesn't strip (@, !, +, -, *, /, ^, &, |, %, <, >, =, :, .) which are used in julia macro, function names and identifiers
+      word = word
+        .replace(/^[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+/, "")
+        .replace(/[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+$/, "");
+
+      word = word.toLowerCase();
     }
+
     return word ?? null;
   },
-  tokenize: (string) => string.split(/[\s\-\.]+/),
+  tokenize: (string) => {
+    const tokens = [];
+    let remaining = string;
+
+    // julia specific patterns
+    const patterns = [
+      // Module qualified names (e.g., Base.sort or more generally Module.Submodule.function)
+      /\b[A-Za-z0-9_]*(?:\.[A-Z][A-Za-z0-9_]*)*\.[a-z_][A-Za-z0-9_!]*\b/g,
+      // Macro calls (e.g., @time, @async)
+      /@[A-Za-z0-9_]*/g,
+      // Type parameters (e.g., Array{T,N}, Vector{Int})
+      /\b[A-Za-z0-9_]*\{[^}]+\}/g,
+      // Function names with module qualification (e.g., Base.+, Base.:^)
+      /\b[A-Za-z0-9_]*\.:[A-Za-z0-9_!+\-*/^&|%<>=.]+/g,
+      // Operators as complete tokens (e.g., !=, &&, ||, ^, .=, ->)
+      /[!<>=+\-*/^&|%:.]+/g,
+      // Function signatures with type annotations (e.g., f(x::Int))
+      /\b[A-Za-z0-9_!]*\([^)]*::[^)]*\)/g,
+      // Numbers (integers, floats, scientific notation)
+      /\b\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b/g,
+    ];
+
+    // apply patterns in order of specificity
+    for (const pattern of patterns) {
+      pattern.lastIndex = 0; // reset regex state
+      let match;
+      while ((match = pattern.exec(remaining)) != null) {
+        const token = match[0].trim();
+        if (token && !tokens.includes(token)) {
+          tokens.push(token);
+        }
+      }
+    }
+
+    // plain split on whitespace and common delimiters for whatever remains
+    const basicTokens = remaining
+      .split(/[\s\-,;()[\]{}]+/)
+      .filter((t) => t.trim());
+    for (const token of basicTokens) {
+      if (token && !tokens.includes(token)) {
+        tokens.push(token);
+      }
+    }
+
+    return tokens.filter((token) => token.length > 0);
+  },
   searchOptions: { prefix: true, boost: { title: 100 }, fuzzy: 2 }
 });

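With the same options wired into the test index, a fully qualified name should now survive as a single token. The following is a hedged usage sketch, not part of the test file: it assumes `index` is the MiniSearch instance constructed in wrapper.js above and that `title` and `text` are among its indexed fields.

index.add({ id: 1, title: "Base.sort!", text: "Sort a vector in place." });

// "Base.sort" is kept intact by the module-qualified-name pattern, so the
// query term matches an indexed term exactly (and, with prefix: true, also
// covers the indexed "base.sort!").
console.log(index.search("Base.sort").length > 0); // expected: true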