-
Notifications
You must be signed in to change notification settings - Fork 505
Improving the tokenizer #2744
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improving the tokenizer #2744
Changes from 14 commits
48cc771
1c8351f
fe6ffb6
bc21332
1bb4510
5075bd2
2ed4454
90a2a3c
fafa21a
efcbae5
712584f
a32613a
3603a54
c1c5bce
e72b381
00cef66
fac1cfa
6199727
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,12 +21,62 @@ const index = new MiniSearch({ | |
| processTerm: (term) => { | ||
| let word = stopWords.has(term) ? null : term; | ||
| if (word) { | ||
| word = word.replace(/^[^a-zA-Z0-9@!]+/, "").replace(/[^a-zA-Z0-9@!]+$/, ""); | ||
| word = word.toLowerCase(); | ||
| // custom trimmer that doesn't strip (@, !, +, -, *, /, ^, &, |, %, <, >, =, :, .), which are used in Julia macro names, function names, and identifiers | ||
| word = word | ||
| .replace(/^[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+/, "") | ||
| .replace(/[^a-zA-Z0-9@!+\-/*^&%|<>._=:]+$/, ""); | ||
|
|
||
| word = word.toLowerCase(); | ||
| } | ||
|
|
||
| return word ?? null; | ||
| }, | ||
| tokenize: (string) => string.split(/[\s\-\.]+/), | ||
| tokenize: (string) => { | ||
| const tokens = []; | ||
| let remaining = string; | ||
|
|
||
| // julia specific patterns | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Huh, it seems a lot of code is duplicated here (and hence might get out of sync)? I wonder if there is a way we could avoid that, e.g. putting shared code into a separate file that is included in both places? (Of course this goes way beyond the scope of this PR, i.e., I am not asking you to take care of this here; I just wanted to put the thought out there.) |
||
| const patterns = [ | ||
| // Module qualified names (e.g., Base.sort, Module.Submodule.function) | ||
Rahban1 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| /\b[A-Za-z0-9_]*(?:\.[A-Z][A-Za-z0-9_]*)*\.[a-z_][A-Za-z0-9_!]*\b/g, | ||
| // Macro calls (e.g., @time, @async) | ||
| /@[A-Za-z0-9_]*/g, | ||
| // Type parameters (e.g., Array{T,N}, Vector{Int}) | ||
| /\b[A-Za-z0-9_]*\{[^}]+\}/g, | ||
| // Function names with module qualification (e.g., Base.+, Base.:^) | ||
| /\b[A-Za-z0-9_]*\.:[A-Za-z0-9_!+\-*/^&|%<>=.]+/g, | ||
| // Operators as complete tokens (e.g., !=, &&, ||, ^, .=, →) | ||
| /[!<>=+\-*/^&|%:.]+/g, | ||
| // Function signatures with type annotations (e.g., f(x::Int)) | ||
| /\b[A-Za-z0-9_!]*\([^)]*::[^)]*\)/g, | ||
| // Numbers (integers, floats, scientific notation) | ||
Rahban1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| /\b\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b/g, | ||
| ]; | ||
|
|
||
| // apply patterns in order of specificity | ||
| for (const pattern of patterns) { | ||
| pattern.lastIndex = 0; //reset regex state | ||
| let match; | ||
| while ((match = pattern.exec(remaining)) != null) { | ||
| const token = match[0].trim(); | ||
| if (token && !tokens.includes(token)) { | ||
| tokens.push(token); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // splitting the content if something remains | ||
| const basicTokens = remaining | ||
| .split(/[\s\-,;()[\]{}]+/) | ||
| .filter((t) => t.trim()); | ||
| for (const token of basicTokens) { | ||
| if (token && !tokens.includes(token)) { | ||
| tokens.push(token); | ||
| } | ||
| } | ||
|
|
||
| return tokens.filter((token) => token.length > 0); | ||
| }, | ||
| searchOptions: { prefix: true, boost: { title: 100 }, fuzzy: 2 } | ||
| }); | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the `Changed` section should be after the "Added" section. But in any case, this is in the wrong place here; we are already beyond 1.15.0 :-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took the liberty of moving this to the right place.