Skip to content

Commit acff03f

Browse files
Simplify the Prism class even more
- Move `tokenize()`, `_matchGrammar()` (and related code), and `resolve()` to separate modules.
- Ditch the `tokenize` symbol in favor of the `$tokenize` special property. Symbols are excluded from the `for…in` loop, but `$tokenize` is not a symbol, so to preserve the previous behavior we need to handle it explicitly in `_matchGrammar`.
- Switch from `[tokenize]` to `$tokenize` in languages.

---

Co-authored-by: Dmitry Sharabin <[email protected]>
1 parent 1455834 commit acff03f

29 files changed

+342
-326
lines changed

src/core/classes/prism.ts

Lines changed: 4 additions & 249 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
1-
import { rest, tokenize } from '../../shared/symbols';
21
import { highlight } from '../highlight';
32
import { highlightAll } from '../highlight-all';
43
import { highlightElement } from '../highlight-element';
5-
import { LinkedList } from '../linked-list';
64
import { Registry } from '../registry';
5+
import { tokenize } from '../tokenize/tokenize';
76
import { Hooks } from './hooks';
8-
import { Token } from './token';
97
import type { KnownPlugins } from '../../known-plugins';
10-
import type { Grammar, GrammarToken, GrammarTokens, RegExpLike } from '../../types';
8+
import type { Grammar } from '../../types';
119
import type { HighlightOptions } from '../highlight';
1210
import type { HighlightAllOptions } from '../highlight-all';
1311
import type { HighlightElementOptions } from '../highlight-element';
14-
import type { LinkedListHeadNode, LinkedListMiddleNode, LinkedListTailNode } from '../linked-list';
1512
import type { TokenStream } from './token';
1613

1714
/**
@@ -45,253 +42,11 @@ export default class Prism {
4542
}
4643

4744
/**
48-
* This is the heart of Prism, and the most low-level function you can use. It accepts a string of text as input
49-
* and the language definitions to use, and returns an array with the tokenized code.
50-
*
51-
* When the language definition includes nested tokens, the function is called recursively on each of these tokens.
52-
*
53-
* This method could be useful in other contexts as well, as a very crude parser.
54-
*
55-
* @param text A string with the code to be highlighted.
56-
* @param grammar An object containing the tokens to use.
57-
*
58-
* Usually a language definition like `Prism.languages.markup`.
59-
* @returns An array of strings and tokens, a token stream.
60-
* @example
61-
* let code = `var foo = 0;`;
62-
* let tokens = Prism.tokenize(code, Prism.getLanguage('javascript'));
63-
* tokens.forEach(token => {
64-
* if (token instanceof Token && token.type === 'number') {
65-
* console.log(`Found numeric literal: ${token.content}`);
66-
* }
67-
* });
45+
* See {@link tokenize}
6846
*/
6947
tokenize (text: string, grammar: Grammar): TokenStream {
70-
const customTokenize = grammar[tokenize];
71-
if (customTokenize) {
72-
return customTokenize(text, grammar, this);
73-
}
74-
75-
let restGrammar = resolve(this.components, grammar[rest]);
76-
while (restGrammar) {
77-
grammar = { ...grammar, ...restGrammar };
78-
restGrammar = resolve(this.components, restGrammar[rest]);
79-
}
80-
81-
const tokenList = new LinkedList<string | Token>();
82-
tokenList.addAfter(tokenList.head, text);
83-
84-
this._matchGrammar(text, tokenList, grammar, tokenList.head, 0);
85-
86-
return tokenList.toArray();
87-
}
88-
89-
private _matchGrammar (
90-
text: string,
91-
tokenList: LinkedList<string | Token>,
92-
grammar: GrammarTokens,
93-
startNode: LinkedListHeadNode<string | Token> | LinkedListMiddleNode<string | Token>,
94-
startPos: number,
95-
rematch?: RematchOptions
96-
): void {
97-
for (const token in grammar) {
98-
const tokenValue = grammar[token];
99-
if (!grammar.hasOwnProperty(token) || !tokenValue) {
100-
continue;
101-
}
102-
103-
const patterns = Array.isArray(tokenValue) ? tokenValue : [tokenValue];
104-
105-
for (let j = 0; j < patterns.length; ++j) {
106-
if (rematch && rematch.cause === `${token},${j}`) {
107-
return;
108-
}
109-
110-
const patternObj = toGrammarToken(patterns[j]);
111-
let { pattern, lookbehind = false, greedy = false, alias, inside } = patternObj;
112-
const insideGrammar = resolve(this.components, inside);
113-
114-
if (greedy && !pattern.global) {
115-
// Without the global flag, lastIndex won't work
116-
patternObj.pattern = pattern = RegExp(pattern.source, pattern.flags + 'g');
117-
}
118-
119-
for (
120-
// iterate the token list and keep track of the current token/string position
121-
let currentNode = startNode.next, pos = startPos;
122-
currentNode.next !== null;
123-
pos += currentNode.value.length, currentNode = currentNode.next
124-
) {
125-
if (rematch && pos >= rematch.reach) {
126-
break;
127-
}
128-
129-
let str = currentNode.value;
130-
131-
if (tokenList.length > text.length) {
132-
// Something went terribly wrong, ABORT, ABORT!
133-
return;
134-
}
135-
136-
if (str instanceof Token) {
137-
continue;
138-
}
139-
140-
let removeCount = 1; // this is the to parameter of removeBetween
141-
let match;
142-
143-
if (greedy) {
144-
match = matchPattern(pattern, pos, text, lookbehind);
145-
if (!match || match.index >= text.length) {
146-
break;
147-
}
148-
149-
const from = match.index;
150-
const to = match.index + match[0].length;
151-
let p = pos;
152-
153-
// find the node that contains the match
154-
p += currentNode.value.length;
155-
while (from >= p) {
156-
currentNode = currentNode.next;
157-
if (currentNode.next === null) {
158-
throw new Error(
159-
'The linked list and the actual text have become de-synced'
160-
);
161-
}
162-
p += currentNode.value.length;
163-
}
164-
// adjust pos (and p)
165-
p -= currentNode.value.length;
166-
pos = p;
167-
168-
// the current node is a Token, then the match starts inside another Token, which is invalid
169-
if (currentNode.value instanceof Token) {
170-
continue;
171-
}
172-
173-
// find the last node which is affected by this match
174-
let k:
175-
| LinkedListMiddleNode<Token | string>
176-
| LinkedListTailNode<Token | string> = currentNode;
177-
for (
178-
;
179-
k.next !== null && (p < to || typeof k.value === 'string');
180-
k = k.next
181-
) {
182-
removeCount++;
183-
p += k.value.length;
184-
}
185-
removeCount--;
186-
187-
// replace with the new match
188-
str = text.slice(pos, p);
189-
match.index -= pos;
190-
}
191-
else {
192-
match = matchPattern(pattern, 0, str, lookbehind);
193-
if (!match) {
194-
continue;
195-
}
196-
}
197-
198-
// eslint-disable-next-line no-redeclare
199-
const from = match.index;
200-
const matchStr = match[0];
201-
const before = str.slice(0, from);
202-
const after = str.slice(from + matchStr.length);
203-
204-
const reach = pos + str.length;
205-
if (rematch && reach > rematch.reach) {
206-
rematch.reach = reach;
207-
}
208-
209-
let removeFrom = currentNode.prev;
210-
211-
if (before) {
212-
removeFrom = tokenList.addAfter(removeFrom, before);
213-
pos += before.length;
214-
}
215-
216-
tokenList.removeRange(removeFrom, removeCount);
217-
218-
const wrapped = new Token(
219-
token,
220-
insideGrammar ? this.tokenize(matchStr, insideGrammar) : matchStr,
221-
alias,
222-
matchStr
223-
);
224-
currentNode = tokenList.addAfter(removeFrom, wrapped);
225-
226-
if (after) {
227-
tokenList.addAfter(currentNode, after);
228-
}
229-
230-
if (removeCount > 1) {
231-
// at least one Token object was removed, so we have to do some rematching
232-
// this can only happen if the current pattern is greedy
233-
234-
const nestedRematch: RematchOptions = {
235-
cause: `${token},${j}`,
236-
reach,
237-
};
238-
this._matchGrammar(
239-
text,
240-
tokenList,
241-
grammar,
242-
currentNode.prev,
243-
pos,
244-
nestedRematch
245-
);
246-
247-
// the reach might have been extended because of the rematching
248-
if (rematch && nestedRematch.reach > rematch.reach) {
249-
rematch.reach = nestedRematch.reach;
250-
}
251-
}
252-
}
253-
}
254-
}
255-
}
256-
}
257-
258-
interface RematchOptions {
259-
cause: string;
260-
reach: number;
261-
}
262-
263-
function matchPattern (pattern: RegExp, pos: number, text: string, lookbehind: boolean) {
264-
pattern.lastIndex = pos;
265-
const match = pattern.exec(text);
266-
if (match && lookbehind && match[1]) {
267-
// change the match to remove the text matched by the Prism lookbehind group
268-
const lookbehindLength = match[1].length;
269-
match.index += lookbehindLength;
270-
match[0] = match[0].slice(lookbehindLength);
271-
}
272-
return match;
273-
}
274-
275-
function toGrammarToken (pattern: GrammarToken | RegExpLike): GrammarToken {
276-
if (!pattern.pattern) {
277-
return { pattern };
278-
}
279-
else {
280-
return pattern;
281-
}
282-
}
283-
284-
function resolve (
285-
components: Registry,
286-
reference: Grammar | string | null | undefined
287-
): Grammar | undefined {
288-
if (reference) {
289-
if (typeof reference === 'string') {
290-
return components.getLanguage(reference);
291-
}
292-
return reference;
48+
return tokenize.call(this, text, grammar);
29349
}
294-
return undefined;
29550
}
29651

29752
export type { Prism };

0 commit comments

Comments
 (0)