Skip to content

Commit f939ccf

Browse files
authored
Added option to use re2j for CSS query regexes (#2407)
Added option to use re2j for CSS query regexes. Re2j is a linear-time regex engine. This now enables jsoup users to safely accept arbitrary CSS regex-based queries.
1 parent c4bf5ba commit f939ccf

File tree

12 files changed

+338
-39
lines changed

12 files changed

+338
-39
lines changed

CHANGES.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,16 @@
33
## 1.22.1 (PENDING)
44

55
### Improvements
6+
* Added support for using the re2j regular expression engine for CSS selectors, which ensures linear-time performance for regex evaluation. This enables safe handling of arbitrary user-supplied query regexes. To enable, add the `com.google.re2j` dependency to your classpath, e.g.:
7+
```xml
8+
<dependency>
9+
<groupId>com.google.re2j</groupId>
10+
<artifactId>re2j</artifactId>
11+
<version>1.8</version>
12+
</dependency>
13+
```
14+
(If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) [#2407](https://github.com/jhy/jsoup/pull/2407)
15+
616
* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser’s configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
717
* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403)
818
* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041)

pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,15 @@
544544
<version>1.0.0</version>
545545
<scope>provided</scope>
546546
</dependency>
547+
548+
<dependency>
549+
<!-- re2j; linear time regex, with 3-clause BSD license -->
550+
<groupId>com.google.re2j</groupId>
551+
<artifactId>re2j</artifactId>
552+
<version>1.8</version>
553+
<optional>true</optional>
554+
<scope>compile</scope>
555+
</dependency>
547556
</dependencies>
548557

549558
<dependencyManagement>
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package org.jsoup.helper;
2+
3+
/**
4+
re2j-backed Regex implementation; must only be touched when re2j is on the classpath.
5+
*/
6+
final class Re2jRegex extends Regex {
7+
private static final java.util.regex.Pattern unused = java.util.regex.Pattern.compile("");
8+
9+
private final com.google.re2j.Pattern re2jPattern;
10+
11+
private Re2jRegex(com.google.re2j.Pattern re2jPattern) {
12+
super(unused);
13+
this.re2jPattern = re2jPattern;
14+
}
15+
16+
public static Regex compile(String regex) {
17+
try {
18+
return new Re2jRegex(com.google.re2j.Pattern.compile(regex));
19+
} catch (RuntimeException e) {
20+
throw new ValidationException("Pattern syntax error: " + e.getMessage());
21+
}
22+
}
23+
24+
@Override
25+
public Matcher matcher(CharSequence input) {
26+
return new Re2jMatcher(re2jPattern.matcher(input));
27+
}
28+
29+
@Override
30+
public String toString() {
31+
return re2jPattern.toString();
32+
}
33+
34+
private static final class Re2jMatcher implements Matcher {
35+
private final com.google.re2j.Matcher delegate;
36+
37+
Re2jMatcher(com.google.re2j.Matcher delegate) {
38+
this.delegate = delegate;
39+
}
40+
41+
@Override
42+
public boolean find() {
43+
return delegate.find();
44+
}
45+
}
46+
}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package org.jsoup.helper;
2+
3+
import org.jsoup.internal.SharedConstants;
4+
5+
import java.lang.reflect.InvocationTargetException;
6+
import java.lang.reflect.Method;
7+
import java.util.regex.Pattern;
8+
import java.util.regex.PatternSyntaxException;
9+
10+
/**
11+
A regular expression abstraction. Allows jsoup to optionally use the re2j regular expression engine (linear time)
12+
instead of the JDK's backtracking regex implementation.
13+
14+
<p>If the {@code com.google.re2j} library is found on the classpath, by default it will be used. You can override this
15+
by setting {@code -Djsoup.useRe2j=false} to explicitly disable, and use the JDK regex engine.</p>
16+
17+
<p>(Currently this is a simplified implementation for jsoup's specific use; it can be extended as required.)</p>
18+
*/
19+
public class Regex {
20+
private static final boolean hasRe2j = hasRe2j();
21+
22+
private final Pattern jdkPattern;
23+
24+
Regex(Pattern jdkPattern) {
25+
this.jdkPattern = jdkPattern;
26+
}
27+
28+
/**
29+
Compile a regex, using re2j if enabled and available; otherwise JDK regex.
30+
31+
@param regex the regex to compile
32+
@return the compiled regex
33+
@throws ValidationException if the regex is invalid
34+
*/
35+
public static Regex compile(String regex) {
36+
if (hasRe2j && wantsRe2j()) {
37+
return Re2jRegex.compile(regex);
38+
}
39+
40+
try {
41+
return new Regex(Pattern.compile(regex));
42+
} catch (PatternSyntaxException e) {
43+
throw new ValidationException("Pattern syntax error: " + e.getMessage());
44+
}
45+
}
46+
47+
/** Wraps an existing JDK Pattern (for API compat); always uses the JDK regex engine and never switches to re2j. */
48+
public static Regex fromPattern(Pattern pattern) {
49+
return new Regex(pattern);
50+
}
51+
52+
static boolean wantsRe2j() {
53+
return Boolean.parseBoolean(System.getProperty(SharedConstants.UseRe2j, "true"));
54+
}
55+
56+
static void wantsRe2j(boolean use) {
57+
System.setProperty(SharedConstants.UseRe2j, Boolean.toString(use));
58+
}
59+
60+
static boolean hasRe2j() {
61+
try {
62+
Class<?> re2 = Class.forName("com.google.re2j.Pattern", false, Regex.class.getClassLoader()); // check if re2j is in classpath
63+
try {
64+
// if it is, and we are on JVM9+, we need to dork around with modules, because re2j doesn't publish a module name.
65+
// done via reflection so we can still run on JVM 8.
66+
// todo remove if re2j publishes as a module
67+
Class<?> moduleCls = Class.forName("java.lang.Module");
68+
Method getModule = Class.class.getMethod("getModule");
69+
Object jsoupMod = getModule.invoke(Regex.class);
70+
Object re2Mod = getModule.invoke(re2);
71+
boolean reads = (boolean) moduleCls.getMethod("canRead", moduleCls).invoke(jsoupMod, re2Mod);
72+
if (!reads) moduleCls.getMethod("addReads", moduleCls).invoke(jsoupMod, re2Mod);
73+
} catch (ClassNotFoundException ignore) {
74+
// jvm8 - no Module class; so we can use as-is
75+
}
76+
return true;
77+
} catch (ClassNotFoundException e) {
78+
return false; // no re2j
79+
} catch (ReflectiveOperationException e) {
80+
// unexpectedly couldn’t wire modules on 9+; return false to avoid IllegalAccessError later
81+
System.err.println("Warning: (bug? please report) couldn't access re2j from jsoup due to modules: " + e);
82+
return false;
83+
}
84+
}
85+
86+
public Matcher matcher(CharSequence input) {
87+
return new JdkMatcher(jdkPattern.matcher(input));
88+
}
89+
90+
@Override
91+
public String toString() {
92+
return jdkPattern.toString();
93+
}
94+
95+
public interface Matcher {
96+
boolean find();
97+
}
98+
99+
private static final class JdkMatcher implements Matcher {
100+
private final java.util.regex.Matcher delegate;
101+
102+
JdkMatcher(java.util.regex.Matcher delegate) {
103+
this.delegate = delegate;
104+
}
105+
106+
@Override
107+
public boolean find() {
108+
return delegate.find();
109+
}
110+
}
111+
}

src/main/java/org/jsoup/internal/SharedConstants.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,7 @@ public final class SharedConstants {
2121

2222
public static final String UseHttpClient = "jsoup.useHttpClient";
2323

24+
public static final String UseRe2j = "jsoup.useRe2j"; // enables use of the re2j regular expression engine when true and it's on the classpath
25+
2426
private SharedConstants() {}
2527
}

src/main/java/org/jsoup/nodes/Element.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.jsoup.helper.Validate;
44
import org.jsoup.internal.Normalizer;
55
import org.jsoup.internal.QuietAppendable;
6+
import org.jsoup.helper.Regex;
67
import org.jsoup.internal.StringUtil;
78
import org.jsoup.parser.ParseSettings;
89
import org.jsoup.parser.Parser;
@@ -1404,7 +1405,6 @@ public Elements getElementsByAttributeValueContaining(String key, String match)
14041405
*/
14051406
public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) {
14061407
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
1407-
14081408
}
14091409

14101410
/**
@@ -1414,13 +1414,13 @@ public Elements getElementsByAttributeValueMatching(String key, Pattern pattern)
14141414
* @return elements that have attributes matching this regular expression
14151415
*/
14161416
public Elements getElementsByAttributeValueMatching(String key, String regex) {
1417-
Pattern pattern;
1417+
Regex pattern;
14181418
try {
1419-
pattern = Pattern.compile(regex);
1419+
pattern = Regex.compile(regex);
14201420
} catch (PatternSyntaxException e) {
14211421
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
14221422
}
1423-
return getElementsByAttributeValueMatching(key, pattern);
1423+
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
14241424
}
14251425

14261426
/**
@@ -1489,13 +1489,13 @@ public Elements getElementsMatchingText(Pattern pattern) {
14891489
* @see Element#text()
14901490
*/
14911491
public Elements getElementsMatchingText(String regex) {
1492-
Pattern pattern;
1492+
Regex pattern;
14931493
try {
1494-
pattern = Pattern.compile(regex);
1494+
pattern = Regex.compile(regex);
14951495
} catch (PatternSyntaxException e) {
14961496
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
14971497
}
1498-
return getElementsMatchingText(pattern);
1498+
return Collector.collect(new Evaluator.Matches(pattern), this);
14991499
}
15001500

15011501
/**
@@ -1515,13 +1515,13 @@ public Elements getElementsMatchingOwnText(Pattern pattern) {
15151515
* @see Element#ownText()
15161516
*/
15171517
public Elements getElementsMatchingOwnText(String regex) {
1518-
Pattern pattern;
1518+
Regex pattern;
15191519
try {
1520-
pattern = Pattern.compile(regex);
1520+
pattern = Regex.compile(regex);
15211521
} catch (PatternSyntaxException e) {
15221522
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
15231523
}
1524-
return getElementsMatchingOwnText(pattern);
1524+
return Collector.collect(new Evaluator.MatchesOwn(pattern), this);
15251525
}
15261526

15271527
/**

src/main/java/org/jsoup/select/Evaluator.java

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
import org.jsoup.nodes.TextNode;
1212
import org.jsoup.nodes.XmlDeclaration;
1313
import org.jsoup.parser.ParseSettings;
14+
import org.jsoup.helper.Regex;
1415

1516
import java.util.List;
1617
import java.util.function.Predicate;
17-
import java.util.regex.Matcher;
1818
import java.util.regex.Pattern;
1919

2020
import static org.jsoup.internal.Normalizer.lowerCase;
@@ -385,13 +385,17 @@ public String toString() {
385385
*/
386386
public static final class AttributeWithValueMatching extends Evaluator {
387387
final String key;
388-
final Pattern pattern;
388+
final Regex pattern;
389389

390-
public AttributeWithValueMatching(String key, Pattern pattern) {
390+
public AttributeWithValueMatching(String key, Regex pattern) {
391391
this.key = normalize(key);
392392
this.pattern = pattern;
393393
}
394394

395+
public AttributeWithValueMatching(String key, Pattern pattern) {
396+
this(key, Regex.fromPattern(pattern)); // api compat
397+
}
398+
395399
@Override
396400
public boolean matches(Element root, Element element) {
397401
return element.hasAttr(key) && pattern.matcher(element.attr(key)).find();
@@ -924,16 +928,19 @@ public String toString() {
924928
* Evaluator for matching Element (and its descendants) text with regex
925929
*/
926930
public static final class Matches extends Evaluator {
927-
private final Pattern pattern;
931+
private final Regex pattern;
928932

929-
public Matches(Pattern pattern) {
933+
public Matches(Regex pattern) {
930934
this.pattern = pattern;
931935
}
932936

937+
public Matches(Pattern pattern) {
938+
this(Regex.fromPattern(pattern));
939+
}
940+
933941
@Override
934942
public boolean matches(Element root, Element element) {
935-
Matcher m = pattern.matcher(element.text());
936-
return m.find();
943+
return pattern.matcher(element.text()).find();
937944
}
938945

939946
@Override protected int cost() {
@@ -950,16 +957,19 @@ public String toString() {
950957
* Evaluator for matching Element's own text with regex
951958
*/
952959
public static final class MatchesOwn extends Evaluator {
953-
private final Pattern pattern;
960+
private final Regex pattern;
954961

955-
public MatchesOwn(Pattern pattern) {
962+
public MatchesOwn(Regex pattern) {
956963
this.pattern = pattern;
957964
}
958965

966+
public MatchesOwn(Pattern pattern) {
967+
this(Regex.fromPattern(pattern));
968+
}
969+
959970
@Override
960971
public boolean matches(Element root, Element element) {
961-
Matcher m = pattern.matcher(element.ownText());
962-
return m.find();
972+
return pattern.matcher(element.ownText()).find();
963973
}
964974

965975
@Override protected int cost() {
@@ -977,16 +987,19 @@ public String toString() {
977987
* @since 1.15.1.
978988
*/
979989
public static final class MatchesWholeText extends Evaluator {
980-
private final Pattern pattern;
990+
private final Regex pattern;
981991

982-
public MatchesWholeText(Pattern pattern) {
992+
public MatchesWholeText(Regex pattern) {
983993
this.pattern = pattern;
984994
}
985995

996+
public MatchesWholeText(Pattern pattern) {
997+
this.pattern = Regex.fromPattern(pattern);
998+
}
999+
9861000
@Override
9871001
public boolean matches(Element root, Element element) {
988-
Matcher m = pattern.matcher(element.wholeText());
989-
return m.find();
1002+
return pattern.matcher(element.wholeText()).find();
9901003
}
9911004

9921005
@Override protected int cost() {
@@ -1004,15 +1017,19 @@ public String toString() {
10041017
* @since 1.15.1.
10051018
*/
10061019
public static final class MatchesWholeOwnText extends Evaluator {
1007-
private final Pattern pattern;
1020+
private final Regex pattern;
10081021

1009-
public MatchesWholeOwnText(Pattern pattern) {
1022+
public MatchesWholeOwnText(Regex pattern) {
10101023
this.pattern = pattern;
10111024
}
10121025

1026+
public MatchesWholeOwnText(Pattern pattern) {
1027+
this(Regex.fromPattern(pattern));
1028+
}
1029+
10131030
@Override
10141031
public boolean matches(Element root, Element element) {
1015-
Matcher m = pattern.matcher(element.wholeOwnText());
1032+
Regex.Matcher m = pattern.matcher(element.wholeOwnText());
10161033
return m.find();
10171034
}
10181035

0 commit comments

Comments
 (0)