Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Version [v1.13.0] - 2025-06-19

### Changed
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the Changed section should be after the "Added" section.

But in any case, this is in the wrong place here, we are already beyond 1.15.0 :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took the liberty of moving this to the right place.


* Improved the search tokenizer and custom trimmer to improve search results. ([#2744])

### Added

* Added new type `RawHTMLHeadContent` to the `HTML` format object, which allows adding raw HTML to the head of the HTML output by passing it as an element in the `assets` keyword argument. ([#2726])
Expand Down Expand Up @@ -2129,6 +2133,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[#2726]: https://github.com/JuliaDocs/Documenter.jl/issues/2726
[#2729]: https://github.com/JuliaDocs/Documenter.jl/issues/2729
[#2737]: https://github.com/JuliaDocs/Documenter.jl/issues/2737
[#2744]: https://github.com/JuliaDocs/Documenter.jl/issues/2744
[#2748]: https://github.com/JuliaDocs/Documenter.jl/issues/2748
[#2750]: https://github.com/JuliaDocs/Documenter.jl/issues/2750
[JuliaLang/julia#36953]: https://github.com/JuliaLang/julia/issues/36953
Expand Down
91 changes: 87 additions & 4 deletions assets/html/js/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,10 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
processTerm: (term) => {
let word = stopWords.has(term) ? null : term;
if (word) {
// custom trimmer that doesn't strip @ and !, which are used in julia macro and function names
// custom trimmer that doesn't strip special characters `@!+-*/^&|%<>=:.` which are used in julia macro and function names.
word = word
.replace(/^[^a-zA-Z0-9@!]+/, "")
.replace(/[^a-zA-Z0-9@!]+$/, "");
.replace(/^[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+/, "")
.replace(/[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+$/, "");

word = word.toLowerCase();
}
Expand All @@ -204,7 +204,52 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
},
// add . as a separator, because otherwise "title": "Documenter.Anchors.add!", would not
// find anything if searching for "add!", only for the entire qualification
tokenize: (string) => string.split(/[\s\-\.]+/),
tokenize: (string) => {
const tokens = [];
let remaining = string;

// julia specific patterns
const patterns = [
// Module qualified names (e.g., Base.sort, Module.Submodule. function)
/\b[A-Za-z0-9_1*(?:\.[A-Z][A-Za-z0-9_1*)*\.[a-z_][A-Za-z0-9_!]*\b/g,
// Macro calls (e.g., @time, @async)
/@[A-Za-z0-9_]*/g,
// Type parameters (e.g., Array{T,N}, Vector{Int})
/\b[A-Za-z0-9_]*\{[^}]+\}/g,
// Function names with module qualification (e.g., Base.+, Base.:^)
/\b[A-Za-z0-9_]*\.:[A-Za-z0-9_!+\-*/^&|%<>=.]+/g,
// Operators as complete tokens (e.g., !=, aã, ||, ^, .=, →)
/[!<>=+\-*/^&|%:.]+/g,
// Function signatures with type annotations (e.g., f(x::Int))
/\b[A-Za-z0-9_!]*\([^)]*::[^)]*\)/g,
// Numbers (integers, floats,scientific notation)
/\b\d+(?:\.\d+)? (?:[eE][+-]?\d+)?\b/g,
];

// apply patterns in order of specificity
for (const pattern of patterns) {
pattern.lastIndex = 0; //reset regex state
let match;
while ((match = pattern.exec(remaining)) != null) {
const token = match[0].trim();
if (token && !tokens.includes(token)) {
tokens.push(token);
}
}
}

// splitting the content if something remains
const basicTokens = remaining
.split(/[\s\-,;()[\]{}]+/)
.filter((t) => t.trim());
for (const token of basicTokens) {
if (token && !tokens.includes(token)) {
tokens.push(token);
}
}

return tokens.filter((token) => token.length > 0);
},
// options which will be applied during the search
searchOptions: {
prefix: true,
Expand Down Expand Up @@ -327,6 +372,35 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
return result_div;
}

/**
 * Rank a search result by how closely its title matches the raw query,
 * layering tiered bonuses on top of MiniSearch's relevance score.
 *
 * @param {{title: string, score: number}} result - A MiniSearch result.
 * @param {string} query - The user's raw search query.
 * @returns {number} The boosted score (higher sorts first).
 */
function calculateCustomScore(result, query) {
  const titleLower = result.title.toLowerCase();
  const queryLower = query.toLowerCase();

  // Tier 1: exact (case-insensitive) title match.
  if (titleLower === queryLower) {
    return 10000 + result.score;
  }

  // Tier 2: title contains the exact query; matches nearer the start of
  // the title rank higher. (Single indexOf replaces includes + indexOf.)
  const position = titleLower.indexOf(queryLower);
  if (position !== -1) {
    return 5000 + result.score - position * 10;
  }

  // Tier 3: every query word occurs (as a substring) in some title word.
  const queryWords = queryLower.trim().split(/\s+/);
  const titleWords = titleLower.trim().split(/\s+/);
  const allWordsInTitle = queryWords.every((qw) =>
    titleWords.some((tw) => tw.includes(qw)),
  );
  if (allWordsInTitle) {
    return 2000 + result.score;
  }

  // No title affinity: keep MiniSearch's score unchanged.
  return result.score;
}

self.onmessage = function (e) {
let query = e.data;
let results = index.search(query, {
Expand All @@ -337,6 +411,15 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) {
combineWith: "AND",
});

// calculate custom scores for all results
results = results.map((result) => ({
...result,
customScore: calculateCustomScore(result, query),
}));

// sort by custom score in descending order
results.sort((a, b) => b.customScore - a.customScore);

// Pre-filter to deduplicate and limit to 200 per category to the extent
// possible without knowing what the filters are.
let filtered_results = [];
Expand Down
56 changes: 53 additions & 3 deletions test/search/wrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,62 @@ const index = new MiniSearch({
processTerm: (term) => {
let word = stopWords.has(term) ? null : term;
if (word) {
word = word.replace(/^[^a-zA-Z0-9@!]+/, "").replace(/[^a-zA-Z0-9@!]+$/, "");
word = word.toLowerCase();
// custom trimmer that doesn't strip (@,!,+, -, *,/,^,&, |, %,<, >, =, :, .) which are used in julia macro,function names and identifiers
word = word
.replace(/^[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+/, "")
.replace(/[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+$/, "");

word = word.toLowerCase();
}

return word ?? null;
},
tokenize: (string) => string.split(/[\s\-\.]+/),
tokenize: (string) => {
const tokens = [];
let remaining = string;

// julia specific patterns
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Huh, it seems a lot of code is duplicated here (and hence might get out of sync) ? I wonder if there is a way we could avoid that, e.g. putting shared code into a separate file that is included in both places?

(Of course this goes way beyond the scope of this PR, i.e., I am not asking you to take care of this here, I just wanted to put the thought out there.

const patterns = [
// Module qualified names (e.g., Base.sort, Module.Submodule. function)
/\b[A-Za-z0-9_1*(?:\.[A-Z][A-Za-z0-9_1*)*\.[a-z_][A-Za-z0-9_!]*\b/g,
// Macro calls (e.g., @time, @async)
/@[A-Za-z0-9_]*/g,
// Type parameters (e.g., Array{T,N}, Vector{Int})
/\b[A-Za-z0-9_]*\{[^}]+\}/g,
// Function names with module qualification (e.g., Base.+, Base.:^)
/\b[A-Za-z0-9_]*\.:[A-Za-z0-9_!+\-*/^&|%<>=.]+/g,
// Operators as complete tokens (e.g., !=, aã, ||, ^, .=, →)
/[!<>=+\-*/^&|%:.]+/g,
// Function signatures with type annotations (e.g., f(x::Int))
/\b[A-Za-z0-9_!]*\([^)]*::[^)]*\)/g,
// Numbers (integers, floats,scientific notation)
/\b\d+(?:\.\d+)? (?:[eE][+-]?\d+)?\b/g,
];

// apply patterns in order of specificity
for (const pattern of patterns) {
pattern.lastIndex = 0; //reset regex state
let match;
while ((match = pattern.exec(remaining)) != null) {
const token = match[0].trim();
if (token && !tokens.includes(token)) {
tokens.push(token);
}
}
}

// splitting the content if something remains
const basicTokens = remaining
.split(/[\s\-,;()[\]{}]+/)
.filter((t) => t.trim());
for (const token of basicTokens) {
if (token && !tokens.includes(token)) {
tokens.push(token);
}
}

return tokens.filter((token) => token.length > 0);
},
searchOptions: { prefix: true, boost: { title: 100 }, fuzzy: 2 }
});

Expand Down
Loading