@@ -3,8 +3,20 @@ import { equalizeWhitespace, escapeRegExp, id } from 'src/utils';
33import * as regexFactory from './regexFactory' ;
44import { type Token , TokenType } from './token' ;
55
6- export const WHITESPACE_REGEX = / ^ ( \s + ) / u;
7- const NULL_REGEX = / (? ! ) / ; // zero-width negative lookahead, matches nothing
6+ // A note about regular expressions
7+ //
8+ // We're using a sticky flag "y" in all tokenizing regexes.
9+ // This works a bit like ^, anchoring the regex to the start,
10+ // but whereas ^ anchors the regex to the start of the string (or line),
11+ // the sticky flag anchors it to the search start position, which we
12+ // can change by setting RegExp.lastIndex.
13+ //
14+ // This allows us to avoid slicing off tokens from the start of the input string
15+ // (which is what we did in the past) and just move the match start position forward,
16+ // which is much more performant on long strings.
17+
18+ const WHITESPACE_REGEX = / ( \s + ) / uy;
19+ const NULL_REGEX = / (? ! ) / uy; // zero-width negative lookahead, matches nothing
820
921const toCanonicalKeyword = ( text : string ) => equalizeWhitespace ( text . toUpperCase ( ) ) ;
1022
@@ -68,6 +80,10 @@ export default class Tokenizer {
6880 private REGEX_MAP : Record < TokenType , RegExp > ;
6981 private quotedIdentRegex : RegExp ;
7082 private paramPatterns : ParamPattern [ ] ;
83+ // The input SQL string to process
84+ private input = '' ;
85+ // Current position in string
86+ private index = 0 ;
7187
7288 private preprocess = ( tokens : Token [ ] ) => tokens ;
7389
@@ -117,12 +133,12 @@ export default class Tokenizer {
117133 ] ) ,
118134 [ TokenType . BLOCK_START ] : regexFactory . createParenRegex ( cfg . blockStart ?? [ '(' ] ) ,
119135 [ TokenType . BLOCK_END ] : regexFactory . createParenRegex ( cfg . blockEnd ?? [ ')' ] ) ,
120- [ TokenType . RESERVED_CASE_START ] : / ^ ( C A S E ) \b / iu ,
121- [ TokenType . RESERVED_CASE_END ] : / ^ ( E N D ) \b / iu ,
136+ [ TokenType . RESERVED_CASE_START ] : / ( C A S E ) \b / iuy ,
137+ [ TokenType . RESERVED_CASE_END ] : / ( E N D ) \b / iuy ,
122138 [ TokenType . LINE_COMMENT ] : regexFactory . createLineCommentRegex ( cfg . lineCommentTypes ?? [ '--' ] ) ,
123- [ TokenType . BLOCK_COMMENT ] : / ^ ( \/ \* [ ^ ] * ?(?: \* \/ | $ ) ) / u ,
139+ [ TokenType . BLOCK_COMMENT ] : / ( \/ \* [ ^ ] * ?(?: \* \/ | $ ) ) / uy ,
124140 [ TokenType . NUMBER ] :
125- / ^ ( 0 x [ 0 - 9 a - f A - F ] + | 0 b [ 0 1 ] + | ( - \s * ) ? [ 0 - 9 ] + ( \. [ 0 - 9 ] * ) ? ( [ e E ] [ - + ] ? [ 0 - 9 ] + ( \. [ 0 - 9 ] + ) ? ) ? ) / u ,
141+ / ( 0 x [ 0 - 9 a - f A - F ] + | 0 b [ 0 1 ] + | ( - \s * ) ? [ 0 - 9 ] + ( \. [ 0 - 9 ] * ) ? ( [ e E ] [ - + ] ? [ 0 - 9 ] + ( \. [ 0 - 9 ] + ) ? ) ? ) / uy ,
126142 [ TokenType . PARAMETER ] : NULL_REGEX , // matches nothing
127143 [ TokenType . EOF ] : NULL_REGEX , // matches nothing
128144 } ;
@@ -152,7 +168,7 @@ export default class Tokenizer {
152168 } ,
153169 {
154170 // ? placeholders
155- regex : cfg . positionalParams ? / ^ ( \? ) / : undefined ,
171+ regex : cfg . positionalParams ? / ( \? ) / uy : undefined ,
156172 parseKey : v => v . slice ( 1 ) ,
157173 } ,
158174 ] ) ;
@@ -172,62 +188,61 @@ export default class Tokenizer {
172188 * @returns {Token[] } output token stream
173189 */
174190 public tokenize ( input : string ) : Token [ ] {
191+ this . input = input ;
192+ this . index = 0 ;
175193 const tokens : Token [ ] = [ ] ;
176194 let token : Token | undefined ;
177195
178- // Keep processing the string until it is empty
179- while ( input . length ) {
196+ // Keep processing the string until end is reached
197+ while ( this . index < this . input . length ) {
180198 // grab any preceding whitespace
181- const whitespaceBefore = this . getWhitespace ( input ) ;
182- input = input . substring ( whitespaceBefore . length ) ;
199+ const whitespaceBefore = this . getWhitespace ( ) ;
183200
184- if ( input . length ) {
201+ if ( this . index < this . input . length ) {
185202 // Get the next token and the token type
186- token = this . getNextToken ( input , token ) ;
203+ token = this . getNextToken ( token ) ;
187204 if ( ! token ) {
188- throw new Error ( `Parse error: Unexpected "${ input . slice ( 0 , 100 ) } "` ) ;
205+ throw new Error ( `Parse error: Unexpected "${ input . slice ( this . index , this . index + 100 ) } "` ) ; // slice() takes an end index, not a length, so offset it by the current position
189206 }
190- // Advance the string
191- input = input . substring ( token . text . length ) ;
192207
193208 tokens . push ( { ...token , whitespaceBefore } ) ;
194209 }
195210 }
196211 return this . preprocess ( tokens ) ;
197212 }
198213
199- /** Matches preceding whitespace if present */
200- private getWhitespace ( input : string ) : string {
201- const matches = input . match ( WHITESPACE_REGEX ) ;
202- return matches ? matches [ 1 ] : '' ;
214+ private getWhitespace ( ) : string {
215+ WHITESPACE_REGEX . lastIndex = this . index ;
216+ const matches = this . input . match ( WHITESPACE_REGEX ) ;
217+ if ( matches ) {
218+ // Advance current position by matched whitespace length
219+ this . index += matches [ 1 ] . length ;
220+ return matches [ 1 ] ;
221+ } else {
222+ return '' ;
223+ }
203224 }
204225
205- /** Attempts to match next token from input string, tests RegExp patterns in decreasing priority */
206- private getNextToken ( input : string , previousToken ?: Token ) : Token | undefined {
226+ private getNextToken ( previousToken ?: Token ) : Token | undefined {
207227 return (
208- this . matchToken ( TokenType . LINE_COMMENT , input ) ||
209- this . matchToken ( TokenType . BLOCK_COMMENT , input ) ||
210- this . matchToken ( TokenType . STRING , input ) ||
211- this . matchQuotedIdentToken ( input ) ||
212- this . matchToken ( TokenType . VARIABLE , input ) ||
213- this . matchToken ( TokenType . BLOCK_START , input ) ||
214- this . matchToken ( TokenType . BLOCK_END , input ) ||
215- this . matchPlaceholderToken ( input ) ||
216- this . matchToken ( TokenType . NUMBER , input ) ||
217- this . matchReservedWordToken ( input , previousToken ) ||
218- this . matchToken ( TokenType . IDENT , input ) ||
219- this . matchToken ( TokenType . OPERATOR , input )
228+ this . matchToken ( TokenType . LINE_COMMENT ) ||
229+ this . matchToken ( TokenType . BLOCK_COMMENT ) ||
230+ this . matchToken ( TokenType . STRING ) ||
231+ this . matchQuotedIdentToken ( ) ||
232+ this . matchToken ( TokenType . VARIABLE ) ||
233+ this . matchToken ( TokenType . BLOCK_START ) ||
234+ this . matchToken ( TokenType . BLOCK_END ) ||
235+ this . matchPlaceholderToken ( ) ||
236+ this . matchToken ( TokenType . NUMBER ) ||
237+ this . matchReservedWordToken ( previousToken ) ||
238+ this . matchToken ( TokenType . IDENT ) ||
239+ this . matchToken ( TokenType . OPERATOR )
220240 ) ;
221241 }
222242
223- /**
224- * Attempts to match a placeholder token pattern
225- * @return {Token | undefined } - The placeholder token if found, otherwise undefined
226- */
227- private matchPlaceholderToken ( input : string ) : Token | undefined {
243+ private matchPlaceholderToken ( ) : Token | undefined {
228244 for ( const { regex, parseKey } of this . paramPatterns ) {
229245 const token = this . match ( {
230- input,
231246 regex,
232247 type : TokenType . PARAMETER ,
233248 transform : id ,
@@ -243,20 +258,15 @@ export default class Tokenizer {
243258 return key . replace ( new RegExp ( escapeRegExp ( '\\' + quoteChar ) , 'gu' ) , quoteChar ) ;
244259 }
245260
246- private matchQuotedIdentToken ( input : string ) : Token | undefined {
261+ private matchQuotedIdentToken ( ) : Token | undefined {
247262 return this . match ( {
248- input,
249263 regex : this . quotedIdentRegex ,
250264 type : TokenType . IDENT ,
251265 transform : id ,
252266 } ) ;
253267 }
254268
255- /**
256- * Attempts to match a Reserved word token pattern, avoiding edge cases of Reserved words within string tokens
257- * @return {Token | undefined } - The Reserved word token if found, otherwise undefined
258- */
259- private matchReservedWordToken ( input : string , previousToken ?: Token ) : Token | undefined {
269+ private matchReservedWordToken ( previousToken ?: Token ) : Token | undefined {
260270 // A reserved word cannot be preceded by a '.'
261271 // this makes it so in "mytable.from", "from" is not considered a reserved word
262272 if ( previousToken ?. value === '.' ) {
@@ -265,57 +275,50 @@ export default class Tokenizer {
265275
266276 // prioritised list of Reserved token types
267277 return (
268- this . matchReservedToken ( TokenType . RESERVED_CASE_START , input ) ||
269- this . matchReservedToken ( TokenType . RESERVED_CASE_END , input ) ||
270- this . matchReservedToken ( TokenType . RESERVED_COMMAND , input ) ||
271- this . matchReservedToken ( TokenType . RESERVED_BINARY_COMMAND , input ) ||
272- this . matchReservedToken ( TokenType . RESERVED_DEPENDENT_CLAUSE , input ) ||
273- this . matchReservedToken ( TokenType . RESERVED_LOGICAL_OPERATOR , input ) ||
274- this . matchReservedToken ( TokenType . RESERVED_KEYWORD , input ) ||
275- this . matchReservedToken ( TokenType . RESERVED_JOIN_CONDITION , input )
278+ this . matchReservedToken ( TokenType . RESERVED_CASE_START ) ||
279+ this . matchReservedToken ( TokenType . RESERVED_CASE_END ) ||
280+ this . matchReservedToken ( TokenType . RESERVED_COMMAND ) ||
281+ this . matchReservedToken ( TokenType . RESERVED_BINARY_COMMAND ) ||
282+ this . matchReservedToken ( TokenType . RESERVED_DEPENDENT_CLAUSE ) ||
283+ this . matchReservedToken ( TokenType . RESERVED_LOGICAL_OPERATOR ) ||
284+ this . matchReservedToken ( TokenType . RESERVED_KEYWORD ) ||
285+ this . matchReservedToken ( TokenType . RESERVED_JOIN_CONDITION )
276286 ) ;
277287 }
278288
279289 // Helper for matching RESERVED_* tokens which need to be transformed to canonical form
280- private matchReservedToken ( tokenType : TokenType , input : string ) : Token | undefined {
290+ private matchReservedToken ( tokenType : TokenType ) : Token | undefined {
281291 return this . match ( {
282- input,
283292 type : tokenType ,
284293 regex : this . REGEX_MAP [ tokenType ] ,
285294 transform : toCanonicalKeyword ,
286295 } ) ;
287296 }
288297
289298 // Shorthand for `match` that looks up regex from REGEX_MAP
290- private matchToken ( tokenType : TokenType , input : string ) : Token | undefined {
299+ private matchToken ( tokenType : TokenType ) : Token | undefined {
291300 return this . match ( {
292- input,
293301 type : tokenType ,
294302 regex : this . REGEX_MAP [ tokenType ] ,
295303 transform : id ,
296304 } ) ;
297305 }
298306
299- /**
300- * Attempts to match RegExp from head of input, returning undefined if not found
301- * @param {string } _.input - The string to match
302- * @param {TokenType } _.type - The type of token to match against
303- * @param {RegExp } _.regex - The regex to match
304- * @return {Token | undefined } - The matched token if found, otherwise undefined
305- */
307+ // Attempts to match RegExp at current position in input
306308 private match ( {
307- input,
308309 type,
309310 regex,
310311 transform,
311312 } : {
312- input : string ;
313313 type : TokenType ;
314314 regex : RegExp ;
315315 transform : ( s : string ) => string ;
316316 } ) : Token | undefined {
317- const matches = input . match ( regex ) ;
317+ regex . lastIndex = this . index ;
318+ const matches = this . input . match ( regex ) ;
318319 if ( matches ) {
320+ // Advance current position by matched token length
321+ this . index += matches [ 1 ] . length ;
319322 return {
320323 type,
321324 text : matches [ 1 ] ,
0 commit comments