Skip to content

Commit b6ee072

Browse files
committed
Optimize parsing of PHP files
Prior to this commit, PatternExpression was used inside the grammar for case-insensitive checks (e.g. for keywords). This can be avoided by introducing a dedicated CaseInsensitiveStringExpression that compares characters directly and thereby skips the relatively expensive regexes.
1 parent 1bf335b commit b6ee072

File tree

2 files changed

+99
-40
lines changed

2 files changed

+99
-40
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* SonarQube PHP Plugin
3+
* Copyright (C) 2010-2025 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the Sonar Source-Available License Version 1, as published by SonarSource SA.
8+
*
9+
* This program is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12+
* See the Sonar Source-Available License for more details.
13+
*
14+
* You should have received a copy of the Sonar Source-Available License
15+
* along with this program; if not, see https://sonarsource.com/license/ssal/
16+
*/
17+
package org.sonar.php.parser;
18+
19+
import org.sonar.sslr.internal.matchers.Matcher;
20+
import org.sonar.sslr.internal.vm.Machine;
21+
import org.sonar.sslr.internal.vm.NativeExpression;
22+
import org.sonar.sslr.internal.vm.PatternExpression;
23+
import org.sonar.sslr.internal.vm.StringExpression;
24+
25+
/**
26+
* This is a variant of {@link StringExpression} which does case-insensitive
27+
* checks to avoid more expensive regex checks that would otherwise be done
28+
* through {@link PatternExpression}.
29+
*/
30+
public class CaseInsensitiveStringExpression extends NativeExpression implements Matcher {
31+
32+
private final String string;
33+
34+
public CaseInsensitiveStringExpression(String string) {
35+
this.string = string;
36+
}
37+
38+
@Override
39+
public void execute(Machine machine) {
40+
if (machine.length() < string.length()) {
41+
machine.backtrack();
42+
return;
43+
}
44+
for (int i = 0; i < string.length(); i++) {
45+
if (Character.toLowerCase(machine.charAt(i)) != Character.toLowerCase(string.charAt(i))) {
46+
machine.backtrack();
47+
return;
48+
}
49+
}
50+
machine.createLeafNode(this, string.length());
51+
machine.jump(1);
52+
}
53+
54+
@Override
55+
public String toString() {
56+
return "String " + string;
57+
}
58+
59+
}

php-frontend/src/main/java/org/sonar/php/parser/PHPLexicalGrammar.java

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ public static void lexical(LexerlessGrammarBuilder b) {
329329

330330
b.rule(EOF).is(b.token(GenericTokenType.EOF, b.endOfInput())).skip();
331331

332-
b.rule(NULL).is(word(b, "NULL")).skip();
332+
b.rule(NULL).is(word(b, "null")).skip();
333333
b.rule(CLASS_CONSTANT).is(word(b, "__CLASS__")).skip();
334334
b.rule(FILE_CONSTANT).is(word(b, "__FILE__")).skip();
335335
b.rule(DIR_CONSTANT).is(word(b, "__DIR__")).skip();
@@ -340,41 +340,41 @@ public static void lexical(LexerlessGrammarBuilder b) {
340340
b.rule(TRAIT_CONSTANT).is(word(b, "__TRAIT__")).skip();
341341
b.rule(ENUM).is(word(b, "enum")).skip();
342342

343-
b.rule(BOOLEAN_LITERAL).is(b.firstOf(word(b, "TRUE"), word(b, "FALSE")));
343+
b.rule(BOOLEAN_LITERAL).is(b.firstOf(word(b, "true"), word(b, "false")));
344344

345345
b.rule(NEXT_IS_DOLLAR).is(b.next(PHPPunctuator.DOLLAR));
346346
b.rule(VARIABLE_VARIABLE_DOLLAR).is(PHPPunctuator.DOLLAR, b.nextNot(b.firstOf(IDENTIFIER, KEYWORDS, PHPPunctuator.LCURLYBRACE)));
347347

348-
b.rule(ISSET).is(word(b, "ISSET")).skip();
349-
b.rule(EMPTY).is(word(b, "EMPTY")).skip();
350-
b.rule(INCLUDE_ONCE).is(word(b, "INCLUDE_ONCE")).skip();
351-
b.rule(INCLUDE).is(word(b, "INCLUDE")).skip();
352-
b.rule(EVAL).is(word(b, "EVAL")).skip();
353-
b.rule(REQUIRE_ONCE).is(word(b, "REQUIRE_ONCE")).skip();
354-
b.rule(REQUIRE).is(word(b, "REQUIRE")).skip();
355-
b.rule(CLONE).is(word(b, "CLONE")).skip();
356-
b.rule(PRINT).is(word(b, "PRINT")).skip();
357-
358-
b.rule(GET).is(word(b, "GET")).skip();
359-
b.rule(SET).is(word(b, "SET")).skip();
360-
361-
b.rule(SELF).is(word(b, "SELF")).skip();
362-
b.rule(PARENT).is(word(b, "PARENT")).skip();
363-
364-
b.rule(MIXED).is(word(b, "MIXED")).skip();
365-
b.rule(INTEGER).is(word(b, "INTEGER")).skip();
366-
b.rule(INT).is(word(b, "INT")).skip();
367-
b.rule(DOUBLE).is(word(b, "DOUBLE")).skip();
368-
b.rule(FLOAT).is(word(b, "FLOAT")).skip();
369-
b.rule(REAL).is(word(b, "REAL")).skip();
370-
b.rule(STRING).is(word(b, "STRING")).skip();
371-
b.rule(OBJECT).is(word(b, "OBJECT")).skip();
372-
b.rule(BOOLEAN).is(word(b, "BOOLEAN")).skip();
373-
b.rule(BOOL).is(word(b, "BOOL")).skip();
374-
b.rule(BINARY).is(word(b, "BINARY")).skip();
375-
b.rule(ITERABLE).is(word(b, "ITERABLE")).skip();
376-
377-
b.rule(FROM).is(word(b, "FROM")).skip();
348+
b.rule(ISSET).is(word(b, "isset")).skip();
349+
b.rule(EMPTY).is(word(b, "empty")).skip();
350+
b.rule(INCLUDE_ONCE).is(word(b, "include_once")).skip();
351+
b.rule(INCLUDE).is(word(b, "include")).skip();
352+
b.rule(EVAL).is(word(b, "eval")).skip();
353+
b.rule(REQUIRE_ONCE).is(word(b, "require_once")).skip();
354+
b.rule(REQUIRE).is(word(b, "require")).skip();
355+
b.rule(CLONE).is(word(b, "clone")).skip();
356+
b.rule(PRINT).is(word(b, "print")).skip();
357+
358+
b.rule(GET).is(word(b, "get")).skip();
359+
b.rule(SET).is(word(b, "set")).skip();
360+
361+
b.rule(SELF).is(word(b, "self")).skip();
362+
b.rule(PARENT).is(word(b, "parent")).skip();
363+
364+
b.rule(MIXED).is(word(b, "mixed")).skip();
365+
b.rule(INTEGER).is(word(b, "integer")).skip();
366+
b.rule(INT).is(word(b, "int")).skip();
367+
b.rule(DOUBLE).is(word(b, "double")).skip();
368+
b.rule(FLOAT).is(word(b, "float")).skip();
369+
b.rule(REAL).is(word(b, "real")).skip();
370+
b.rule(STRING).is(word(b, "string")).skip();
371+
b.rule(OBJECT).is(word(b, "object")).skip();
372+
b.rule(BOOLEAN).is(word(b, "boolean")).skip();
373+
b.rule(BOOL).is(word(b, "bool")).skip();
374+
b.rule(BINARY).is(word(b, "binary")).skip();
375+
b.rule(ITERABLE).is(word(b, "iterable")).skip();
376+
377+
b.rule(FROM).is(word(b, "from")).skip();
378378

379379
}
380380

@@ -385,21 +385,21 @@ private static void keywords(LexerlessGrammarBuilder b) {
385385
PHPKeyword tokenType = PHPKeyword.values()[i];
386386

387387
// PHP keywords are case insensitive
388-
b.rule(tokenType).is(SPACING, keywordRegexp(b, tokenType.getValue()), b.nextNot(b.regexp(LexicalConstant.IDENTIFIER_PART))).skip();
388+
b.rule(tokenType).is(SPACING, caseInsensitive(tokenType.getValue()), b.nextNot(b.regexp(LexicalConstant.IDENTIFIER_PART))).skip();
389389
if (i > 1) {
390390
if (tokenType == PHPKeyword.READONLY) {
391391
// Readonly is only a keyword when it is not used as a function name. SONARPHP-1266
392-
rest[i - 2] = b.sequence(keywordRegexp(b, "readonly"), b.nextNot(b.regexp("[\\s]*\\(")));
392+
rest[i - 2] = b.sequence(caseInsensitive("readonly"), b.nextNot(b.regexp("[\\s]*\\(")));
393393
} else {
394-
rest[i - 2] = keywordRegexp(b, tokenType.getValue());
394+
rest[i - 2] = caseInsensitive(tokenType.getValue());
395395
}
396396
}
397397
}
398398

399399
b.rule(KEYWORDS).is(SPACING,
400400
b.firstOf(
401-
keywordRegexp(b, PHPKeyword.getKeywordValues()[0]),
402-
keywordRegexp(b, PHPKeyword.getKeywordValues()[1]),
401+
caseInsensitive(PHPKeyword.getKeywordValues()[0]),
402+
caseInsensitive(PHPKeyword.getKeywordValues()[1]),
403403
rest),
404404
b.nextNot(b.regexp(LexicalConstant.IDENTIFIER_PART)));
405405
}
@@ -411,10 +411,10 @@ private static void punctuators(LexerlessGrammarBuilder b) {
411411
}
412412

413413
private static Object word(LexerlessGrammarBuilder b, String word) {
414-
return b.sequence(SPACING, b.regexp("(?i)" + word), b.nextNot(b.regexp(LexicalConstant.IDENTIFIER_PART)));
414+
return b.sequence(SPACING, caseInsensitive(word), b.nextNot(b.regexp(LexicalConstant.IDENTIFIER_PART)));
415415
}
416416

417-
private static Object keywordRegexp(LexerlessGrammarBuilder b, String keywordValue) {
418-
return b.regexp("(?i)" + keywordValue);
417+
private static Object caseInsensitive(String value) {
418+
return new CaseInsensitiveStringExpression(value);
419419
}
420420
}

0 commit comments

Comments
 (0)