Skip to content

Commit 2033538

Browse files
committed
Added option to use re2j for CSS query regexes
Re2j is a linear-time regex engine. This enables jsoup users to now safely accept arbitrary CSS regex-based queries.
1 parent c4bf5ba commit 2033538

File tree

12 files changed

+319
-39
lines changed

12 files changed

+319
-39
lines changed

CHANGES.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,16 @@
33
## 1.22.1 (PENDING)
44

55
### Improvements
6+
* Added support for using the re2j regular expression engine for CSS selectors, which ensures linear-time performance for regex evaluation. This enables safe handling of arbitrary user-supplied query regexes. To enable, add the `com.google.re2j` dependency to your classpath, e.g.:
7+
```xml
8+
<dependency>
9+
<groupId>com.google.re2j</groupId>
10+
<artifactId>re2j</artifactId>
11+
<version>1.8</version>
12+
</dependency>
13+
```
14+
(If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.)
15+
616
* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser’s configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
717
* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403)
818
* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041)

pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,15 @@
544544
<version>1.0.0</version>
545545
<scope>provided</scope>
546546
</dependency>
547+
548+
<dependency>
549+
<!-- re2j; linear time regex, with 3-clause BSD license -->
550+
<groupId>com.google.re2j</groupId>
551+
<artifactId>re2j</artifactId>
552+
<version>1.8</version>
553+
<optional>true</optional>
554+
<scope>compile</scope>
555+
</dependency>
547556
</dependencies>
548557

549558
<dependencyManagement>
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package org.jsoup.helper;
2+
3+
/**
4+
re2j-backed Regex implementation; must only be touched when re2j is on the classpath.
5+
*/
6+
final class Re2jRegex extends Regex {
7+
private static final java.util.regex.Pattern unused = java.util.regex.Pattern.compile("");
8+
9+
private final com.google.re2j.Pattern re2jPattern;
10+
11+
private Re2jRegex(com.google.re2j.Pattern re2jPattern) {
12+
super(unused);
13+
this.re2jPattern = re2jPattern;
14+
}
15+
16+
public static Regex compile(String regex) {
17+
try {
18+
return new Re2jRegex(com.google.re2j.Pattern.compile(regex));
19+
} catch (RuntimeException e) {
20+
throw new ValidationException("Pattern syntax error: " + e.getMessage());
21+
}
22+
}
23+
24+
@Override
25+
public Matcher matcher(CharSequence input) {
26+
return new Re2jMatcher(re2jPattern.matcher(input));
27+
}
28+
29+
@Override
30+
public String toString() {
31+
return re2jPattern.toString();
32+
}
33+
34+
private static final class Re2jMatcher implements Matcher {
35+
private final com.google.re2j.Matcher delegate;
36+
37+
Re2jMatcher(com.google.re2j.Matcher delegate) {
38+
this.delegate = delegate;
39+
}
40+
41+
@Override
42+
public boolean find() {
43+
return delegate.find();
44+
}
45+
}
46+
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package org.jsoup.helper;
2+
3+
import org.jsoup.internal.SharedConstants;
4+
5+
import java.util.regex.Pattern;
6+
import java.util.regex.PatternSyntaxException;
7+
8+
/**
9+
A regular expression abstraction. Allows jsoup to optionally use the re2j regular expression engine (linear time)
10+
instead of the JDK's backtracking regex implementation.
11+
12+
<p>If the {@code com.google.re2j} library is found on the classpath, by default it will be used. You can override this
13+
by setting {@code -Djsoup.useRe2j=false} to explicitly disable, and use the JDK regex engine.</p>
14+
15+
<p>(Currently this a simplified implementation for jsoup's specific use; can extend as required.)</p>
16+
*/
17+
public class Regex {
18+
private static final boolean hasRe2j = hasRe2j();
19+
20+
private final Pattern jdkPattern;
21+
22+
Regex(Pattern jdkPattern) {
23+
this.jdkPattern = jdkPattern;
24+
}
25+
26+
/**
27+
Compile a regex, using re2j if enabled and available; otherwise JDK regex.
28+
29+
@param regex the regex to compile
30+
@return the compiled regex
31+
@throws ValidationException if the regex is invalid
32+
*/
33+
public static Regex compile(String regex) {
34+
if (hasRe2j && wantsRe2j()) {
35+
return Re2jRegex.compile(regex);
36+
}
37+
38+
try {
39+
return new Regex(Pattern.compile(regex));
40+
} catch (PatternSyntaxException e) {
41+
throw new ValidationException("Pattern syntax error: " + e.getMessage());
42+
}
43+
}
44+
45+
/** Wraps an existing JDK Pattern (for API compat); doesn't switch */
46+
public static Regex fromPattern(Pattern pattern) {
47+
return new Regex(pattern);
48+
}
49+
50+
static boolean wantsRe2j() {
51+
return Boolean.parseBoolean(System.getProperty(SharedConstants.UseRe2j, "true"));
52+
}
53+
54+
static void wantsRe2j(boolean use) {
55+
System.setProperty(SharedConstants.UseRe2j, Boolean.toString(use));
56+
}
57+
58+
static boolean hasRe2j() {
59+
try {
60+
Class.forName("com.google.re2j.Pattern", false, Regex.class.getClassLoader());
61+
return true;
62+
} catch (ClassNotFoundException e) {
63+
return false;
64+
}
65+
}
66+
67+
public Matcher matcher(CharSequence input) {
68+
return new JdkMatcher(jdkPattern.matcher(input));
69+
}
70+
71+
@Override
72+
public String toString() {
73+
return jdkPattern.toString();
74+
}
75+
76+
public interface Matcher {
77+
boolean find();
78+
}
79+
80+
private static final class JdkMatcher implements Matcher {
81+
private final java.util.regex.Matcher delegate;
82+
83+
JdkMatcher(java.util.regex.Matcher delegate) {
84+
this.delegate = delegate;
85+
}
86+
87+
@Override
88+
public boolean find() {
89+
return delegate.find();
90+
}
91+
}
92+
}

src/main/java/org/jsoup/internal/SharedConstants.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,7 @@ public final class SharedConstants {
2121

2222
public static final String UseHttpClient = "jsoup.useHttpClient";
2323

24+
public static final String UseRe2j = "jsoup.useRe2j"; // enables use of the re2j regular expression engine when true and it's on the classpath
25+
2426
private SharedConstants() {}
2527
}

src/main/java/org/jsoup/nodes/Element.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.jsoup.helper.Validate;
44
import org.jsoup.internal.Normalizer;
55
import org.jsoup.internal.QuietAppendable;
6+
import org.jsoup.helper.Regex;
67
import org.jsoup.internal.StringUtil;
78
import org.jsoup.parser.ParseSettings;
89
import org.jsoup.parser.Parser;
@@ -1404,7 +1405,6 @@ public Elements getElementsByAttributeValueContaining(String key, String match)
14041405
*/
14051406
public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) {
14061407
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
1407-
14081408
}
14091409

14101410
/**
@@ -1414,13 +1414,13 @@ public Elements getElementsByAttributeValueMatching(String key, Pattern pattern)
14141414
* @return elements that have attributes matching this regular expression
14151415
*/
14161416
public Elements getElementsByAttributeValueMatching(String key, String regex) {
1417-
Pattern pattern;
1417+
Regex pattern;
14181418
try {
1419-
pattern = Pattern.compile(regex);
1419+
pattern = Regex.compile(regex);
14201420
} catch (PatternSyntaxException e) {
14211421
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
14221422
}
1423-
return getElementsByAttributeValueMatching(key, pattern);
1423+
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
14241424
}
14251425

14261426
/**
@@ -1489,13 +1489,13 @@ public Elements getElementsMatchingText(Pattern pattern) {
14891489
* @see Element#text()
14901490
*/
14911491
public Elements getElementsMatchingText(String regex) {
1492-
Pattern pattern;
1492+
Regex pattern;
14931493
try {
1494-
pattern = Pattern.compile(regex);
1494+
pattern = Regex.compile(regex);
14951495
} catch (PatternSyntaxException e) {
14961496
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
14971497
}
1498-
return getElementsMatchingText(pattern);
1498+
return Collector.collect(new Evaluator.Matches(pattern), this);
14991499
}
15001500

15011501
/**
@@ -1515,13 +1515,13 @@ public Elements getElementsMatchingOwnText(Pattern pattern) {
15151515
* @see Element#ownText()
15161516
*/
15171517
public Elements getElementsMatchingOwnText(String regex) {
1518-
Pattern pattern;
1518+
Regex pattern;
15191519
try {
1520-
pattern = Pattern.compile(regex);
1520+
pattern = Regex.compile(regex);
15211521
} catch (PatternSyntaxException e) {
15221522
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
15231523
}
1524-
return getElementsMatchingOwnText(pattern);
1524+
return Collector.collect(new Evaluator.MatchesOwn(pattern), this);
15251525
}
15261526

15271527
/**

src/main/java/org/jsoup/select/Evaluator.java

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
import org.jsoup.nodes.TextNode;
1212
import org.jsoup.nodes.XmlDeclaration;
1313
import org.jsoup.parser.ParseSettings;
14+
import org.jsoup.helper.Regex;
1415

1516
import java.util.List;
1617
import java.util.function.Predicate;
17-
import java.util.regex.Matcher;
1818
import java.util.regex.Pattern;
1919

2020
import static org.jsoup.internal.Normalizer.lowerCase;
@@ -385,13 +385,17 @@ public String toString() {
385385
*/
386386
public static final class AttributeWithValueMatching extends Evaluator {
387387
final String key;
388-
final Pattern pattern;
388+
final Regex pattern;
389389

390-
public AttributeWithValueMatching(String key, Pattern pattern) {
390+
public AttributeWithValueMatching(String key, Regex pattern) {
391391
this.key = normalize(key);
392392
this.pattern = pattern;
393393
}
394394

395+
public AttributeWithValueMatching(String key, Pattern pattern) {
396+
this(key, Regex.fromPattern(pattern)); // api compat
397+
}
398+
395399
@Override
396400
public boolean matches(Element root, Element element) {
397401
return element.hasAttr(key) && pattern.matcher(element.attr(key)).find();
@@ -924,16 +928,19 @@ public String toString() {
924928
* Evaluator for matching Element (and its descendants) text with regex
925929
*/
926930
public static final class Matches extends Evaluator {
927-
private final Pattern pattern;
931+
private final Regex pattern;
928932

929-
public Matches(Pattern pattern) {
933+
public Matches(Regex pattern) {
930934
this.pattern = pattern;
931935
}
932936

937+
public Matches(Pattern pattern) {
938+
this(Regex.fromPattern(pattern));
939+
}
940+
933941
@Override
934942
public boolean matches(Element root, Element element) {
935-
Matcher m = pattern.matcher(element.text());
936-
return m.find();
943+
return pattern.matcher(element.text()).find();
937944
}
938945

939946
@Override protected int cost() {
@@ -950,16 +957,19 @@ public String toString() {
950957
* Evaluator for matching Element's own text with regex
951958
*/
952959
public static final class MatchesOwn extends Evaluator {
953-
private final Pattern pattern;
960+
private final Regex pattern;
954961

955-
public MatchesOwn(Pattern pattern) {
962+
public MatchesOwn(Regex pattern) {
956963
this.pattern = pattern;
957964
}
958965

966+
public MatchesOwn(Pattern pattern) {
967+
this(Regex.fromPattern(pattern));
968+
}
969+
959970
@Override
960971
public boolean matches(Element root, Element element) {
961-
Matcher m = pattern.matcher(element.ownText());
962-
return m.find();
972+
return pattern.matcher(element.ownText()).find();
963973
}
964974

965975
@Override protected int cost() {
@@ -977,16 +987,19 @@ public String toString() {
977987
* @since 1.15.1.
978988
*/
979989
public static final class MatchesWholeText extends Evaluator {
980-
private final Pattern pattern;
990+
private final Regex pattern;
981991

982-
public MatchesWholeText(Pattern pattern) {
992+
public MatchesWholeText(Regex pattern) {
983993
this.pattern = pattern;
984994
}
985995

996+
public MatchesWholeText(Pattern pattern) {
997+
this.pattern = Regex.fromPattern(pattern);
998+
}
999+
9861000
@Override
9871001
public boolean matches(Element root, Element element) {
988-
Matcher m = pattern.matcher(element.wholeText());
989-
return m.find();
1002+
return pattern.matcher(element.wholeText()).find();
9901003
}
9911004

9921005
@Override protected int cost() {
@@ -1004,15 +1017,19 @@ public String toString() {
10041017
* @since 1.15.1.
10051018
*/
10061019
public static final class MatchesWholeOwnText extends Evaluator {
1007-
private final Pattern pattern;
1020+
private final Regex pattern;
10081021

1009-
public MatchesWholeOwnText(Pattern pattern) {
1022+
public MatchesWholeOwnText(Regex pattern) {
10101023
this.pattern = pattern;
10111024
}
10121025

1026+
public MatchesWholeOwnText(Pattern pattern) {
1027+
this(Regex.fromPattern(pattern));
1028+
}
1029+
10131030
@Override
10141031
public boolean matches(Element root, Element element) {
1015-
Matcher m = pattern.matcher(element.wholeOwnText());
1032+
Regex.Matcher m = pattern.matcher(element.wholeOwnText());
10161033
return m.find();
10171034
}
10181035

src/main/java/org/jsoup/select/NodeEvaluator.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
import org.jsoup.nodes.Element;
55
import org.jsoup.nodes.LeafNode;
66
import org.jsoup.nodes.Node;
7-
8-
import java.util.regex.Pattern;
7+
import org.jsoup.helper.Regex;
98

109
import static org.jsoup.internal.Normalizer.lowerCase;
1110
import static org.jsoup.internal.StringUtil.normaliseWhitespace;
@@ -98,9 +97,9 @@ public String toString() {
9897
}
9998

10099
static class MatchesValue extends NodeEvaluator {
101-
private final Pattern pattern;
100+
private final Regex pattern;
102101

103-
protected MatchesValue(Pattern pattern) {
102+
protected MatchesValue(Regex pattern) {
104103
this.pattern = pattern;
105104
}
106105

0 commit comments

Comments
 (0)