Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@
## 1.22.1 (PENDING)

### Improvements
* Added support for using the re2j regular expression engine for CSS selectors, which ensures linear-time performance for regex evaluation. This enables safe handling of arbitrary user-supplied query regexes. To enable, add the `com.google.re2j` dependency to your classpath, e.g.:
```xml
<dependency>
<groupId>com.google.re2j</groupId>
<artifactId>re2j</artifactId>
<version>1.8</version>
</dependency>
```
(If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) [#2407](https://github.com/jhy/jsoup/pull/2407)

* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser’s configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403)
* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041)
Expand Down
9 changes: 9 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,15 @@
<version>1.0.0</version>
<scope>provided</scope>
</dependency>

<dependency>
<!-- re2j; linear time regex, with 3-clause BSD license -->
<groupId>com.google.re2j</groupId>
<artifactId>re2j</artifactId>
<version>1.8</version>
<optional>true</optional>
<scope>compile</scope>
</dependency>
</dependencies>

<dependencyManagement>
Expand Down
46 changes: 46 additions & 0 deletions src/main/java/org/jsoup/helper/Re2jRegex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.jsoup.helper;

/**
re2j-backed Regex implementation; must only be touched when re2j is on the classpath.
*/
final class Re2jRegex extends Regex {
private static final java.util.regex.Pattern unused = java.util.regex.Pattern.compile("");

private final com.google.re2j.Pattern re2jPattern;

private Re2jRegex(com.google.re2j.Pattern re2jPattern) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Concrete references to com.google.re2j.* classes means re2j is non-optional or the android r8 shrinker throws an error

This was discovered while attempting to adopt the new jsoup version in the AnkiDroid project, where we have at a non-trivial development cost implemented "release mode" emulator testing after having shrinker errors in the past. It surfaced the problem

ankidroid/Anki-Android#19985
https://github.com/ankidroid/Anki-Android/actions/runs/20636686551/job/59315193515#step:8:298

An initial attempt to solve the issue was made by simply adding the observationally-non-optional re2j transitive but that is unsatisfactory as we have also implemented an APK size comparison tool to see the impact of such changes, and re2j appears to add approximately 70 kilobytes to our APK size. That is not huge but is best avoided for something that is in our use case not going to result in a user-perceivable benefit

Can this intended-to-be-optional transitive be made actually optional here? Perhaps through reflective usage in the Re2jRegex helper similar to the "has re2j" detection elsewhere?

Thanks!

super(unused);
this.re2jPattern = re2jPattern;
}

/**
 Compile the supplied regex with the re2j engine and wrap it as a {@link Regex}.
 @param regex the regex to compile
 @return a re2j-backed Regex
 @throws ValidationException if re2j raises any runtime exception while compiling the regex
 */
public static Regex compile(String regex) {
    final com.google.re2j.Pattern compiled;
    try {
        compiled = com.google.re2j.Pattern.compile(regex);
    } catch (RuntimeException e) {
        // surface engine failures using the same message prefix as the JDK-backed path
        throw new ValidationException("Pattern syntax error: " + e.getMessage());
    }
    return new Re2jRegex(compiled);
}

/**
 Create a matcher that evaluates this re2j pattern against the input.
 @param input the character sequence to match against
 @return a Matcher that delegates to the underlying re2j matcher
 */
@Override
public Matcher matcher(CharSequence input) {
    final com.google.re2j.Matcher re2jMatcher = re2jPattern.matcher(input);
    return new Re2jMatcher(re2jMatcher);
}

/**
 Get the string form of the wrapped re2j pattern.
 @return the result of the re2j pattern's {@code toString()}
 */
@Override
public String toString() {
    final String patternText = re2jPattern.toString();
    return patternText;
}

/** Adapts a re2j matcher to the {@code Matcher} interface returned by {@code matcher(CharSequence)}. */
private static final class Re2jMatcher implements Matcher {
    private final com.google.re2j.Matcher re2jMatcher;

    Re2jMatcher(com.google.re2j.Matcher delegate) {
        re2jMatcher = delegate;
    }

    /** Forwards to the re2j matcher's {@code find()}. */
    @Override
    public boolean find() {
        final boolean found = re2jMatcher.find();
        return found;
    }
}
}
111 changes: 111 additions & 0 deletions src/main/java/org/jsoup/helper/Regex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package org.jsoup.helper;

import org.jsoup.internal.SharedConstants;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 A regular expression abstraction. Allows jsoup to optionally use the re2j regular expression engine (linear time)
 instead of the JDK's backtracking regex implementation.
 <p>If the {@code com.google.re2j} library is found on the classpath, it will be used by default. You can override this
 by setting {@code -Djsoup.useRe2j=false} to explicitly disable it, and use the JDK regex engine.</p>
 <p>(Currently this is a simplified implementation for jsoup's specific use; can extend as required.)</p>
 */
public class Regex {
    // probed once, when this class is initialized
    private static final boolean hasRe2j = hasRe2j();

    private final Pattern jdkPattern;

    Regex(Pattern jdkPattern) {
        this.jdkPattern = jdkPattern;
    }

    /**
     Compile a regex, using re2j if enabled and available; otherwise JDK regex.
     @param regex the regex to compile
     @return the compiled regex
     @throws ValidationException if the regex is invalid
     */
    public static Regex compile(String regex) {
        final boolean useRe2j = hasRe2j && wantsRe2j();
        if (!useRe2j) {
            final Pattern compiled;
            try {
                compiled = Pattern.compile(regex);
            } catch (PatternSyntaxException e) {
                throw new ValidationException("Pattern syntax error: " + e.getMessage());
            }
            return new Regex(compiled);
        }

        return Re2jRegex.compile(regex);
    }

    /**
     Wraps an existing JDK Pattern (for API compat); doesn't switch to re2j.
     @param pattern the JDK pattern to wrap
     @return a Regex backed by the supplied JDK pattern
     */
    public static Regex fromPattern(Pattern pattern) {
        return new Regex(pattern);
    }

    /** Reads the {@code SharedConstants.UseRe2j} system property; treated as "true" when unset. */
    static boolean wantsRe2j() {
        final String setting = System.getProperty(SharedConstants.UseRe2j, "true");
        return Boolean.parseBoolean(setting);
    }

    /** Sets the {@code SharedConstants.UseRe2j} system property to the supplied value. */
    static void wantsRe2j(boolean use) {
        final String setting = Boolean.toString(use);
        System.setProperty(SharedConstants.UseRe2j, setting);
    }

    /**
     Checks whether re2j can be used: its Pattern class must be loadable, and on JVMs that have
     {@code java.lang.Module} the jsoup module must be able to read the re2j module.
     @return true if re2j is usable; false if it is absent or the module wiring failed
     */
    static boolean hasRe2j() {
        final Class<?> re2jClass;
        try {
            // check if re2j is in classpath (without initializing the class)
            re2jClass = Class.forName("com.google.re2j.Pattern", false, Regex.class.getClassLoader());
        } catch (ClassNotFoundException e) {
            return false; // no re2j
        }

        try {
            ensureModuleReads(re2jClass);
        } catch (ClassNotFoundException ignore) {
            // jvm8 - no Module class; so we can use as-is
        } catch (ReflectiveOperationException e) {
            // unexpectedly couldn’t wire modules on 9+; return false to avoid IllegalAccessError later
            System.err.println("Warning: (bug? please report) couldn't access re2j from jsoup due to modules: " + e);
            return false;
        }
        return true;
    }

    /**
     If we are on JVM9+, we need to dork around with modules, because re2j doesn't publish a module name. Done via
     reflection so we can still run on JVM 8.
     todo remove if re2j publishes as a module
     */
    private static void ensureModuleReads(Class<?> re2jClass) throws ReflectiveOperationException {
        final Class<?> moduleType = Class.forName("java.lang.Module");
        final Method getModule = Class.class.getMethod("getModule");
        final Object jsoupModule = getModule.invoke(Regex.class);
        final Object re2jModule = getModule.invoke(re2jClass);

        final Method canRead = moduleType.getMethod("canRead", moduleType);
        final boolean alreadyReads = (boolean) canRead.invoke(jsoupModule, re2jModule);
        if (alreadyReads) {
            return;
        }
        moduleType.getMethod("addReads", moduleType).invoke(jsoupModule, re2jModule);
    }

    /**
     Create a matcher that evaluates this regex against the input.
     @param input the character sequence to match against
     @return a Matcher over the input
     */
    public Matcher matcher(CharSequence input) {
        final java.util.regex.Matcher jdkMatcher = jdkPattern.matcher(input);
        return new JdkMatcher(jdkMatcher);
    }

    /** Returns the string form of the wrapped JDK pattern. */
    @Override
    public String toString() {
        return jdkPattern.toString();
    }

    /** The matching operations jsoup needs from a regex engine's matcher. */
    public interface Matcher {
        /** Delegates to the backing engine matcher's {@code find()}. */
        boolean find();
    }

    /** Adapts a JDK matcher to the {@link Matcher} interface. */
    private static final class JdkMatcher implements Matcher {
        private final java.util.regex.Matcher jdkMatcher;

        JdkMatcher(java.util.regex.Matcher delegate) {
            jdkMatcher = delegate;
        }

        @Override
        public boolean find() {
            return jdkMatcher.find();
        }
    }
}
2 changes: 2 additions & 0 deletions src/main/java/org/jsoup/internal/SharedConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,7 @@ public final class SharedConstants {

public static final String UseHttpClient = "jsoup.useHttpClient";

public static final String UseRe2j = "jsoup.useRe2j"; // enables use of the re2j regular expression engine when true and it's on the classpath

private SharedConstants() {}
}
20 changes: 10 additions & 10 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.QuietAppendable;
import org.jsoup.helper.Regex;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -1404,7 +1405,6 @@ public Elements getElementsByAttributeValueContaining(String key, String match)
*/
public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) {
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);

}

/**
Expand All @@ -1414,13 +1414,13 @@ public Elements getElementsByAttributeValueMatching(String key, Pattern pattern)
* @return elements that have attributes matching this regular expression
*/
public Elements getElementsByAttributeValueMatching(String key, String regex) {
Pattern pattern;
Regex pattern;
try {
pattern = Pattern.compile(regex);
pattern = Regex.compile(regex);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
}
return getElementsByAttributeValueMatching(key, pattern);
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
}

/**
Expand Down Expand Up @@ -1489,13 +1489,13 @@ public Elements getElementsMatchingText(Pattern pattern) {
* @see Element#text()
*/
public Elements getElementsMatchingText(String regex) {
Pattern pattern;
Regex pattern;
try {
pattern = Pattern.compile(regex);
pattern = Regex.compile(regex);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
}
return getElementsMatchingText(pattern);
return Collector.collect(new Evaluator.Matches(pattern), this);
}

/**
Expand All @@ -1515,13 +1515,13 @@ public Elements getElementsMatchingOwnText(Pattern pattern) {
* @see Element#ownText()
*/
public Elements getElementsMatchingOwnText(String regex) {
Pattern pattern;
Regex pattern;
try {
pattern = Pattern.compile(regex);
pattern = Regex.compile(regex);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
}
return getElementsMatchingOwnText(pattern);
return Collector.collect(new Evaluator.MatchesOwn(pattern), this);
}

/**
Expand Down
53 changes: 35 additions & 18 deletions src/main/java/org/jsoup/select/Evaluator.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.ParseSettings;
import org.jsoup.helper.Regex;

import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.internal.Normalizer.lowerCase;
Expand Down Expand Up @@ -385,13 +385,17 @@ public String toString() {
*/
public static final class AttributeWithValueMatching extends Evaluator {
final String key;
final Pattern pattern;
final Regex pattern;

public AttributeWithValueMatching(String key, Pattern pattern) {
public AttributeWithValueMatching(String key, Regex pattern) {
this.key = normalize(key);
this.pattern = pattern;
}

public AttributeWithValueMatching(String key, Pattern pattern) {
this(key, Regex.fromPattern(pattern)); // api compat
}

@Override
public boolean matches(Element root, Element element) {
return element.hasAttr(key) && pattern.matcher(element.attr(key)).find();
Expand Down Expand Up @@ -924,16 +928,19 @@ public String toString() {
* Evaluator for matching Element (and its descendants) text with regex
*/
public static final class Matches extends Evaluator {
private final Pattern pattern;
private final Regex pattern;

public Matches(Pattern pattern) {
public Matches(Regex pattern) {
this.pattern = pattern;
}

public Matches(Pattern pattern) {
this(Regex.fromPattern(pattern));
}

@Override
public boolean matches(Element root, Element element) {
Matcher m = pattern.matcher(element.text());
return m.find();
return pattern.matcher(element.text()).find();
}

@Override protected int cost() {
Expand All @@ -950,16 +957,19 @@ public String toString() {
* Evaluator for matching Element's own text with regex
*/
public static final class MatchesOwn extends Evaluator {
private final Pattern pattern;
private final Regex pattern;

public MatchesOwn(Pattern pattern) {
public MatchesOwn(Regex pattern) {
this.pattern = pattern;
}

public MatchesOwn(Pattern pattern) {
this(Regex.fromPattern(pattern));
}

@Override
public boolean matches(Element root, Element element) {
Matcher m = pattern.matcher(element.ownText());
return m.find();
return pattern.matcher(element.ownText()).find();
}

@Override protected int cost() {
Expand All @@ -977,16 +987,19 @@ public String toString() {
* @since 1.15.1.
*/
public static final class MatchesWholeText extends Evaluator {
private final Pattern pattern;
private final Regex pattern;

public MatchesWholeText(Pattern pattern) {
public MatchesWholeText(Regex pattern) {
this.pattern = pattern;
}

public MatchesWholeText(Pattern pattern) {
this.pattern = Regex.fromPattern(pattern);
}

@Override
public boolean matches(Element root, Element element) {
Matcher m = pattern.matcher(element.wholeText());
return m.find();
return pattern.matcher(element.wholeText()).find();
}

@Override protected int cost() {
Expand All @@ -1004,15 +1017,19 @@ public String toString() {
* @since 1.15.1.
*/
public static final class MatchesWholeOwnText extends Evaluator {
private final Pattern pattern;
private final Regex pattern;

public MatchesWholeOwnText(Pattern pattern) {
public MatchesWholeOwnText(Regex pattern) {
this.pattern = pattern;
}

public MatchesWholeOwnText(Pattern pattern) {
this(Regex.fromPattern(pattern));
}

@Override
public boolean matches(Element root, Element element) {
Matcher m = pattern.matcher(element.wholeOwnText());
Regex.Matcher m = pattern.matcher(element.wholeOwnText());
return m.find();
}

Expand Down
Loading
Loading