diff --git a/conformance/expected-failures.yaml b/conformance/expected-failures.yaml index 5d92bdca..0db100ef 100644 --- a/conformance/expected-failures.yaml +++ b/conformance/expected-failures.yaml @@ -107,96 +107,6 @@ custom_constraints: #ERROR: :1:1: expression of type 'int' cannot be range of a comprehension (must be list, map, or dynamic) # | this.all(e, e == 1) # | ^ -library/is_uri: - - invalid/host/c - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://foo@你好.com"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - invalid/host_ipv6/a - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://2001:0db8:85a3:0000:0000:8a2e:0370:7334"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - invalid/host_ipv6_zone-id_empty - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[::1%25]"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - invalid/host_ipv6_zone-id_unquoted - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[::1%eth0]"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - invalid/host_reg-name_pct-encoded_invalid_utf8 - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://foo%c3x%96"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - invalid/port/a - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://example.com:8a"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - invalid/port/b - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://example.com:x"} - # want: validation error (1 violation) - # 1. 
constraint_id: "library.is_uri" - # got: valid - - invalid/userinfo_reserved_at - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://@@example.com"} - # want: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # got: valid - - valid/host_ipfuture_exhaust - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[vF.-!$&'()*+,;=._~0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]"} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # message: "" - - valid/host_ipfuture_long - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[v1234AF.x]"} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # message: "" - - valid/host_ipfuture_short - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[v1.x]"} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # message: "" - - valid/host_ipv6_zone-id_pct-encoded_ascii - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[::1%25foo%61%20%23]"} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # message: "" - - valid/host_ipv6_zone-id_pct-encoded_utf8 - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"https://[::1%25foo%c3%96]"} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # message: "" - - valid/path-empty - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUri]:{val:"foo:"} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri" - # message: "" -library/is_uri_ref: - - valid/empty_string - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUriRef]:{} - # want: valid - # got: validation error (1 violation) - # 1. 
constraint_id: "library.is_uri_ref" - # message: "" - - valid/path-empty - # input: [type.googleapis.com/buf.validate.conformance.cases.IsUriRef]:{} - # want: valid - # got: validation error (1 violation) - # 1. constraint_id: "library.is_uri_ref" - # message: "" standard_constraints/ignore: - proto/2023/map/ignore_always/invalid/populated # input: [type.googleapis.com/buf.validate.conformance.cases.EditionsMapIgnoreAlways]:{val:{key:1 value:1}} diff --git a/src/main/java/build/buf/protovalidate/CustomOverload.java b/src/main/java/build/buf/protovalidate/CustomOverload.java index 889b0f78..94cc4819 100644 --- a/src/main/java/build/buf/protovalidate/CustomOverload.java +++ b/src/main/java/build/buf/protovalidate/CustomOverload.java @@ -15,8 +15,6 @@ package build.buf.protovalidate; import com.google.common.primitives.Bytes; -import java.net.URI; -import java.net.URISyntaxException; import java.util.HashSet; import java.util.Locale; import java.util.Set; @@ -62,19 +60,19 @@ final class CustomOverload { */ static Overload[] create() { return new Overload[] { - format(), - unique(), - startsWith(), - endsWith(), - contains(), + celFormat(), + celUnique(), + celStartsWith(), + celEndsWith(), + celContains(), celIsHostname(), celIsEmail(), celIsIp(), celIsIpPrefix(), celIsUri(), celIsUriRef(), - isNan(), - isInf(), + celIsNan(), + celIsInf(), celIsHostAndPort(), }; } @@ -84,7 +82,7 @@ static Overload[] create() { * * @return The {@link Overload} instance for the "format" operation. */ - private static Overload format() { + private static Overload celFormat() { return Overload.binary( OVERLOAD_FORMAT, (lhs, rhs) -> { @@ -106,7 +104,7 @@ private static Overload format() { * * @return The {@link Overload} instance for the "unique" operation. 
*/ - private static Overload unique() { + private static Overload celUnique() { return Overload.unary( OVERLOAD_UNIQUE, (val) -> { @@ -122,7 +120,7 @@ private static Overload unique() { * * @return The {@link Overload} instance for the "startsWith" operation. */ - private static Overload startsWith() { + private static Overload celStartsWith() { return Overload.binary( OVERLOAD_STARTS_WITH, (lhs, rhs) -> { @@ -157,7 +155,7 @@ private static Overload startsWith() { * * @return The {@link Overload} instance for the "endsWith" operation. */ - private static Overload endsWith() { + private static Overload celEndsWith() { return Overload.binary( OVERLOAD_ENDS_WITH, (lhs, rhs) -> { @@ -192,7 +190,7 @@ private static Overload endsWith() { * * @return The {@link Overload} instance for the "contains" operation. */ - private static Overload contains() { + private static Overload celContains() { return Overload.binary( OVERLOAD_CONTAINS, (lhs, rhs) -> { @@ -262,14 +260,14 @@ private static Overload celIsIp() { return Err.noSuchOverload(value, OVERLOAD_IS_IP, null); } String addr = (String) value.value(); - return Types.boolOf(isIP(addr, 0L)); + return Types.boolOf(isIp(addr, 0L)); }, (lhs, rhs) -> { if (lhs.type().typeEnum() != TypeEnum.String || rhs.type().typeEnum() != TypeEnum.Int) { return Err.noSuchOverload(lhs, OVERLOAD_IS_IP, rhs); } String address = (String) lhs.value(); - return Types.boolOf(isIP(address, rhs.intValue())); + return Types.boolOf(isIp(address, rhs.intValue())); }, null); } @@ -289,7 +287,7 @@ private static Overload celIsIpPrefix() { return Err.noSuchOverload(value, OVERLOAD_IS_IP_PREFIX, null); } String prefix = (String) value.value(); - return Types.boolOf(isIPPrefix(prefix, 0L, false)); + return Types.boolOf(isIpPrefix(prefix, 0L, false)); }, (lhs, rhs) -> { if (lhs.type().typeEnum() != TypeEnum.String @@ -299,9 +297,9 @@ private static Overload celIsIpPrefix() { } String prefix = (String) lhs.value(); if (rhs.type().typeEnum() == TypeEnum.Int) { - 
return Types.boolOf(isIPPrefix(prefix, rhs.intValue(), false)); + return Types.boolOf(isIpPrefix(prefix, rhs.intValue(), false)); } - return Types.boolOf(isIPPrefix(prefix, 0L, rhs.booleanValue())); + return Types.boolOf(isIpPrefix(prefix, 0L, rhs.booleanValue())); }, (values) -> { if (values.length != 3 @@ -311,7 +309,7 @@ private static Overload celIsIpPrefix() { return Err.noSuchOverload(values[0], OVERLOAD_IS_IP_PREFIX, "", values); } String prefix = (String) values[0].value(); - return Types.boolOf(isIPPrefix(prefix, values[1].intValue(), values[2].booleanValue())); + return Types.boolOf(isIpPrefix(prefix, values[1].intValue(), values[2].booleanValue())); }); } @@ -328,10 +326,7 @@ private static Overload celIsUri() { return Err.noSuchOverload(value, OVERLOAD_IS_URI, null); } String addr = (String) value.value(); - if (addr.isEmpty()) { - return BoolT.False; - } - return Types.boolOf(validateURI(addr, true)); + return Types.boolOf(isUri(addr)); }); } @@ -348,10 +343,7 @@ private static Overload celIsUriRef() { return Err.noSuchOverload(value, OVERLOAD_IS_URI_REF, null); } String addr = (String) value.value(); - if (addr.isEmpty()) { - return BoolT.False; - } - return Types.boolOf(validateURI(addr, false)); + return Types.boolOf(isUriRef(addr)); }); } @@ -360,7 +352,7 @@ private static Overload celIsUriRef() { * * @return The {@link Overload} instance for the "isNan" operation. */ - private static Overload isNan() { + private static Overload celIsNan() { return Overload.unary( OVERLOAD_IS_NAN, value -> { @@ -377,7 +369,7 @@ private static Overload isNan() { * * @return The {@link Overload} instance for the "isInf" operation. 
*/ - private static Overload isInf() { + private static Overload celIsInf() { return Overload.overload( OVERLOAD_IS_INF, null, @@ -448,21 +440,21 @@ private static boolean isHostAndPort(String str, boolean portRequired) { int endPlus = end + 1; if (endPlus == str.length()) { // no port - return !portRequired && isIP(str.substring(1, end), 6); + return !portRequired && isIp(str.substring(1, end), 6); } else if (endPlus == splitIdx) { // port - return isIP(str.substring(1, end), 6) && isPort(str.substring(splitIdx + 1)); + return isIp(str.substring(1, end), 6) && isPort(str.substring(splitIdx + 1)); } return false; // malformed } if (splitIdx < 0) { - return !portRequired && (isHostname(str) || isIP(str, 4)); + return !portRequired && (isHostname(str) || isIp(str, 4)); } String host = str.substring(0, splitIdx); String port = str.substring(splitIdx + 1); - return ((isHostname(host) || isIP(host, 4)) && isPort(port)); + return ((isHostname(host) || isIp(host, 4)) && isPort(port)); } // Returns true if the string is a valid port for isHostAndPort. @@ -606,7 +598,7 @@ private static boolean isHostname(String val) { *
Both formats are well-defined in the internet standard RFC 3986. Zone identifiers for IPv6 * addresses (for example "fe80::a%en1") are supported. */ - private static boolean isIP(String addr, long ver) { + static boolean isIp(String addr, long ver) { if (ver == 6L) { return new Ipv6(addr).address(); } else if (ver == 4L) { @@ -618,22 +610,24 @@ private static boolean isIP(String addr, long ver) { } /** - * Validates if the input string is a valid URI, which can be a URL or a URN. + * Returns true if the string is a URI, for example "https://example.com/foo/bar?baz=quux#frag". * - * @param val The input string to validate as a URI. - * @param checkAbsolute Whether to check if this URI is absolute (i.e. has a scheme component) - * @return {@code true} if the input string is a valid URI, {@code false} otherwise. + *
URI is defined in the internet standard RFC 3986. Zone Identifiers in IPv6 address literals + * are supported (RFC 6874). */ - private static boolean validateURI(String val, boolean checkAbsolute) { - try { - URI uri = new URI(val); - if (checkAbsolute) { - return uri.isAbsolute(); - } - return true; - } catch (URISyntaxException e) { - return false; - } + private static boolean isUri(String str) { + return new Uri(str).uri(); + } + + /** + * Returns true if the string is a URI Reference - a URI such as + * "https://example.com/foo/bar?baz=quux#frag", or a Relative Reference such as "./foo/bar?query". + * + *
URI, URI Reference, and Relative Reference are defined in the internet standard RFC 3986. + * Zone Identifiers in IPv6 address literals are supported (RFC 6874). + */ + private static boolean isUriRef(String str) { + return new Uri(str).uriReference(); } /** @@ -653,7 +647,7 @@ private static boolean validateURI(String val, boolean checkAbsolute) { *
The same principle applies to IPv4 addresses. "192.168.1.0/24" designates the first 24 bits * of the 32-bit IPv4 as the network prefix. */ - private static boolean isIPPrefix(String str, long version, boolean strict) { + private static boolean isIpPrefix(String str, long version, boolean strict) { if (version == 6L) { Ipv6 ip = new Ipv6(str); return ip.addressPrefix() && (!strict || ip.isPrefixOnly()); @@ -661,7 +655,7 @@ private static boolean isIPPrefix(String str, long version, boolean strict) { Ipv4 ip = new Ipv4(str); return ip.addressPrefix() && (!strict || ip.isPrefixOnly()); } else if (version == 0L) { - return isIPPrefix(str, 6, strict) || isIPPrefix(str, 4, strict); + return isIpPrefix(str, 6, strict) || isIpPrefix(str, 4, strict); } return false; } diff --git a/src/main/java/build/buf/protovalidate/Ipv4.java b/src/main/java/build/buf/protovalidate/Ipv4.java index 52f249a3..c3726b64 100644 --- a/src/main/java/build/buf/protovalidate/Ipv4.java +++ b/src/main/java/build/buf/protovalidate/Ipv4.java @@ -17,6 +17,9 @@ import java.util.ArrayList; import java.util.List; +/** + * Ipv4 is a class used to parse a given string to determine if it is an IPv4 address or address prefix. + */ final class Ipv4 { private String str; private int index; @@ -76,7 +79,7 @@ boolean addressPrefix() { && this.index == this.str.length(); } - // Stores value in `prefixLen` + // Store value in prefixLen private boolean prefixLength() { int start = this.index; @@ -169,9 +172,9 @@ private boolean decOctet() { } /** - * Reports whether the current position is a digit. + * Determines whether the current position is a digit. * - *
Method parses the rule: + *
Parses the rule: * *
DIGIT = %x30-39 ; 0-9
*/
@@ -185,9 +188,7 @@ private boolean digit() {
}
/**
- * Take the given char at the current index.
- *
- * If char is at the current index, increment the index.
+ * Take the given char at the current position, incrementing the index if necessary.
*/
private boolean take(char c) {
if (this.index >= this.str.length()) {
diff --git a/src/main/java/build/buf/protovalidate/Ipv6.java b/src/main/java/build/buf/protovalidate/Ipv6.java
index aa178943..db14eb68 100644
--- a/src/main/java/build/buf/protovalidate/Ipv6.java
+++ b/src/main/java/build/buf/protovalidate/Ipv6.java
@@ -18,6 +18,9 @@
import java.util.List;
import javax.annotation.Nullable;
+/**
+ * Ipv6 is a class used to parse a given string to determine if it is an IPv6 address or address prefix.
+ */
final class Ipv6 {
private String str;
private int index;
@@ -121,7 +124,7 @@ boolean addressPrefix() {
&& this.index == this.str.length();
}
- // Stores value in `prefixLen`
+ // Stores value in prefixLen
private boolean prefixLength() {
int start = this.index;
@@ -159,7 +162,7 @@ private boolean prefixLength() {
}
}
- // Stores dotted notation for right-most 32 bits in `dottedRaw` / `dottedAddr` if found.
+ // Stores dotted notation for right-most 32 bits in dottedRaw / dottedAddr if found.
private boolean addressPart() {
while (this.index < this.str.length()) {
// dotted notation for right-most 32 bits, e.g. 0:0:0:0:0:ffff:192.1.56.10
@@ -227,9 +230,9 @@ private boolean zoneID() {
}
/**
- * Determines whether string contains a dotted address.
+ * Determines whether the current position is a dotted address.
*
- *
Method parses the rule:
+ *
Parses the rule:
*
*
1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
*
@@ -254,9 +257,9 @@ private boolean dotted() {
}
/**
- * Determine whether string contains an h16.
+ * Determines whether the current position is an h16.
*
- * Method parses the rule:
+ *
Parses the rule:
*
*
h16 = 1*4HEXDIG
*
@@ -291,9 +294,9 @@ private boolean h16() {
}
/**
- * Reports whether the current position is a hex digit.
+ * Determines whether the current position is a hex digit.
*
- * Method parses the rule:
+ *
Parses the rule:
*
*
HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
*/
@@ -310,9 +313,9 @@ private boolean hexDig() {
}
/**
- * Reports whether the current position is a digit.
+ * Determines whether the current position is a digit.
*
- * Method parses the rule:
+ *
Parses the rule:
*
*
DIGIT = %x30-39 ; 0-9
*/
@@ -326,9 +329,7 @@ private boolean digit() {
}
/**
- * Take the given char at the current index.
- *
- * If char is at the current index, increment the index.
+ * Take the given char at the current position, incrementing the index if necessary.
*/
private boolean take(char c) {
if (this.index >= this.str.length()) {
diff --git a/src/main/java/build/buf/protovalidate/Uri.java b/src/main/java/build/buf/protovalidate/Uri.java
new file mode 100644
index 00000000..63287135
--- /dev/null
+++ b/src/main/java/build/buf/protovalidate/Uri.java
@@ -0,0 +1,949 @@
+// Copyright 2023-2024 Buf Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buf.protovalidate;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Uri is a class used to parse a given string to determine if it is a URI or URI reference.
+ */
+final class Uri {
+  /** The string being parsed; it is never reassigned after construction. */
+  private final String str;
+
+  /** Offset into {@link #str} of the next character to be examined. */
+  private int index;
+
+  /** Set to true by {@link #pctEncoded()} whenever a percent-encoded triplet is consumed. */
+  private boolean pctEncodedFound;
+
+  /**
+   * Creates a parser positioned at the start of the given string.
+   *
+   * @param str the candidate URI or URI reference
+   */
+  Uri(String str) {
+    this.str = str;
+  }
+
+  /**
+   * Determines whether string is a valid URI.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]</pre>
+   *
+   * <p>Whenever parsing fails, the position is restored to where it was on entry. This matters
+   * because {@link #uriReference()} retries the same input as a relative reference; without the
+   * rewind a failed query or fragment would leave the position in the middle of the string and
+   * e.g. {@code "a:b?//[::1]"} would be accepted by parsing only its tail.
+   *
+   * @return {@code true} if the input from the current position to its end is a URI
+   */
+  boolean uri() {
+    int start = this.index;
+
+    boolean valid =
+        this.scheme()
+            && this.take(':')
+            && this.hierPart()
+            // The query and fragment are optional, but once introduced they must be valid.
+            && (!this.take('?') || this.query())
+            && (!this.take('#') || this.fragment())
+            // Nothing may follow the last component.
+            && this.index == this.str.length();
+
+    if (!valid) {
+      this.index = start;
+    }
+
+    return valid;
+  }
+
+  /**
+   * Determines whether the current position is a valid hier-part.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>
+   * hier-part = "//" authority path-abempty
+   *           / path-absolute
+   *           / path-rootless
+   *           / path-empty
+   * </pre>
+   */
+  private boolean hierPart() {
+    int mark = this.index;
+
+    boolean authorityForm = this.takeDoubleSlash() && this.authority() && this.pathAbempty();
+    if (authorityForm) {
+      return true;
+    }
+
+    // The authority form may have consumed input before failing; start over for the path forms.
+    this.index = mark;
+
+    if (this.pathAbsolute()) {
+      return true;
+    }
+    if (this.pathRootless()) {
+      return true;
+    }
+    return this.pathEmpty();
+  }
+
+  /**
+   * Determines whether string is a valid URI reference.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>URI-reference = URI / relative-ref</pre>
+   *
+   * @return {@code true} if the input from the current position to its end is a URI or a relative
+   *     reference
+   */
+  boolean uriReference() {
+    int start = this.index;
+
+    if (this.uri()) {
+      return true;
+    }
+
+    // A failed URI parse must not influence the relative-ref attempt: make sure both
+    // alternatives are tried against the same starting position.
+    this.index = start;
+
+    return this.relativeRef();
+  }
+
+  /**
+   * Determines whether the current position is a valid relative reference.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>relative-ref = relative-part [ "?" query ] [ "#" fragment ]</pre>
+   */
+  private boolean relativeRef() {
+    int mark = this.index;
+
+    if (!this.relativePart()) {
+      return false;
+    }
+
+    // Optional query and fragment, followed by the end of the input.
+    boolean tailValid =
+        (!this.take('?') || this.query())
+            && (!this.take('#') || this.fragment())
+            && this.index == this.str.length();
+
+    if (tailValid) {
+      return true;
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid relative part.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>
+   * relative-part = "//" authority path-abempty
+   *               / path-absolute
+   *               / path-noscheme
+   *               / path-empty
+   * </pre>
+   */
+  private boolean relativePart() {
+    int mark = this.index;
+
+    boolean authorityForm = this.takeDoubleSlash() && this.authority() && this.pathAbempty();
+    if (authorityForm) {
+      return true;
+    }
+
+    // The authority form may have consumed input before failing; start over for the path forms.
+    this.index = mark;
+
+    if (this.pathAbsolute()) {
+      return true;
+    }
+    if (this.pathNoscheme()) {
+      return true;
+    }
+    return this.pathEmpty();
+  }
+
+  /**
+   * Takes two consecutive slashes ("//") at the current position.
+   *
+   * <p>If only the first slash is present it is still consumed, so callers restore the position
+   * themselves when this returns false.
+   */
+  private boolean takeDoubleSlash() {
+    if (!this.take('/')) {
+      return false;
+    }
+
+    return this.take('/');
+  }
+
+  /**
+   * Determines whether the current position is a valid scheme.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )</pre>
+   *
+   * <p>The scheme must be followed by ":". The colon is only peeked at, not consumed. On failure
+   * the position is restored.
+   */
+  private boolean scheme() {
+    int start = this.index;
+
+    if (this.alpha()) {
+      while (this.alpha() || this.digit() || this.take('+') || this.take('-') || this.take('.')) {
+        // keep consuming scheme characters
+      }
+
+      // The scheme characters may run to the end of the input (e.g. "foo"), so check the bounds
+      // before peeking; an unguarded charAt would throw StringIndexOutOfBoundsException.
+      if (this.index < this.str.length() && this.str.charAt(this.index) == ':') {
+        return true;
+      }
+    }
+
+    this.index = start;
+
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid authority.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>authority = [ userinfo "@" ] host [ ":" port ]</pre>
+   *
+   * <p>Led by double slash ("//") and terminated by "/", "?", "#", or end of URI.
+   */
+  private boolean authority() {
+    int mark = this.index;
+
+    // Each step only runs if all previous ones succeeded, in grammar order.
+    boolean valid = !this.userinfo() || this.take('@');
+    valid = valid && this.host();
+    valid = valid && (!this.take(':') || this.port());
+    valid = valid && this.isAuthorityEnd();
+
+    if (!valid) {
+      this.index = mark;
+    }
+
+    return valid;
+  }
+
+ /**
+ * Determines whether the current position is the end of the authority.
+ *
+ * The authority component [...] is terminated by one of the following:
+ *
+ *
Parses the rule: + * + *
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+ *
+ * Terminated by "@" in authority.
+ */
+  private boolean userinfo() {
+    int mark = this.index;
+
+    while (this.unreserved() || this.pctEncoded() || this.subDelims() || this.take(':')) {
+      // keep consuming userinfo characters
+    }
+
+    // The consumed run only counts as userinfo when "@" follows it. The "@" itself is left for
+    // the caller to take.
+    boolean followedByAt = this.index < this.str.length() && this.str.charAt(this.index) == '@';
+
+    if (!followedByAt) {
+      this.index = mark;
+    }
+
+    return followedByAt;
+  }
+
+  /**
+   * Converts a hexadecimal digit to its numeric value.
+   *
+   * @param c the character to convert
+   * @return the value 0-15 for "0"-"9", "a"-"f" and "A"-"F"; 0 for any other character
+   */
+  private static int unhex(char c) {
+    if (c >= '0' && c <= '9') {
+      return c - '0';
+    }
+    if (c >= 'a' && c <= 'f') {
+      return 10 + (c - 'a');
+    }
+    if (c >= 'A' && c <= 'F') {
+      return 10 + (c - 'A');
+    }
+
+    return 0;
+  }
+
+  /**
+   * Verifies that str is correctly percent-encoded, i.e. that the bytes obtained by replacing
+   * every "%XX" triplet with the byte it denotes form well-formed UTF-8.
+   *
+   * <p>Note that we essentially want to mimic the behavior of decodeURIComponent, which would fail
+   * on malformed URLs. Java does have various methods for decoding URLs, but none behave
+   * consistently with decodeURIComponent.
+   *
+   * <p>The code below is a combination of {@code checkHostPctEncoded} from the protovalidate-go
+   * implementation and Java's java.net.URI#decode methods.
+   *
+   * @param str the raw host text, as matched by {@link #host()}
+   * @return {@code true} if the decoded bytes are valid UTF-8, {@code false} otherwise (including
+   *     when a "%" is not followed by two more characters)
+   */
+  private boolean checkHostPctEncoded(String str) {
+    int strLen = str.length();
+    // Decoding never produces more bytes than there are input characters.
+    ByteBuffer buffer = ByteBuffer.allocate(strLen);
+
+    for (int i = 0; i < strLen; ) {
+      char c = str.charAt(i);
+
+      if (c == '%') {
+        // A truncated escape cannot be decoded. Report it as invalid instead of reading past
+        // the end of the string.
+        if (i + 2 >= strLen) {
+          return false;
+        }
+
+        int hi = unhex(str.charAt(i + 1));
+        int lo = unhex(str.charAt(i + 2));
+        buffer.put((byte) ((hi << 4) | lo));
+        i += 3;
+      } else {
+        // Literal characters are copied through unchanged. The host grammar only admits ASCII
+        // outside of percent-encoded triplets, so each one is a single UTF-8 byte.
+        buffer.put((byte) c);
+        i++;
+      }
+    }
+
+    // Attempt to decode the bytes as UTF-8; a new decoder reports malformed input as an error.
+    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
+    CharBuffer out = CharBuffer.allocate(strLen);
+    CoderResult result = decoder.decode((ByteBuffer) buffer.flip(), out, true);
+
+    if (result.isError()) {
+      return false;
+    }
+
+    // Flushing completes the decoding operation; an error here also means invalid input.
+    return !decoder.flush(out).isError();
+  }
+
+  /**
+   * Determines whether the current position is a valid host.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>host = IP-literal / IPv4address / reg-name</pre>
+   *
+   * <p>An empty host at the very end of the input is accepted. If the host contains
+   * percent-encoded triplets, they must additionally decode to valid UTF-8.
+   */
+  private boolean host() {
+    if (this.index >= this.str.length()) {
+      return true;
+    }
+
+    int mark = this.index;
+    this.pctEncodedFound = false;
+
+    // Note: IPv4address is a subset of reg-name
+    boolean bracketed = this.str.charAt(this.index) == '[';
+    boolean parsed = (bracketed && this.ipLiteral()) || this.regName();
+
+    if (!parsed) {
+      return false;
+    }
+
+    if (!this.pctEncodedFound) {
+      return true;
+    }
+
+    // RFC 3986:
+    // > URI producing applications must not use percent-encoding in host
+    // > unless it is used to represent a UTF-8 character sequence.
+    String rawHost = this.str.substring(mark, this.index);
+    return this.checkHostPctEncoded(rawHost);
+  }
+
+  /**
+   * Determines whether the current position is a valid port.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>port = *DIGIT</pre>
+   *
+   * <p>Terminated by end of authority.
+   */
+  private boolean port() {
+    int mark = this.index;
+
+    while (this.digit()) {
+      // keep consuming digits
+    }
+
+    if (this.isAuthorityEnd()) {
+      return true;
+    }
+
+    // Something other than a digit precedes the end of the authority.
+    this.index = mark;
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid IP literal.
+   *
+   * <p>Parses the rule from RFC 6874:
+   *
+   * <pre>IP-literal = "[" ( IPv6address / IPv6addrz / IPvFuture ) "]"</pre>
+   */
+  private boolean ipLiteral() {
+    int mark = this.index;
+
+    if (!this.take('[')) {
+      this.index = mark;
+      return false;
+    }
+
+    // Every alternative is tried from the position right after the opening bracket.
+    int inner = this.index;
+
+    if (this.ipv6Address() && this.take(']')) {
+      return true;
+    }
+
+    this.index = inner;
+    if (this.ipv6Addrz() && this.take(']')) {
+      return true;
+    }
+
+    this.index = inner;
+    if (this.ipvFuture() && this.take(']')) {
+      return true;
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid IPv6 address.
+   *
+   * <p>Parses the rule "IPv6address" by consuming the longest run of hex digits and colons and
+   * handing it to {@code CustomOverload.isIp} with version 6.
+   *
+   * <p>NOTE(review): only hex digits and ":" are consumed, so an address with a dotted-quad tail
+   * is cut off at its first "." before being checked - confirm this is intended.
+   */
+  private boolean ipv6Address() {
+    int mark = this.index;
+
+    while (this.hexDig() || this.take(':')) {
+      // keep consuming address characters
+    }
+
+    String candidate = this.str.substring(mark, this.index);
+    boolean valid = CustomOverload.isIp(candidate, 6);
+
+    if (!valid) {
+      this.index = mark;
+    }
+
+    return valid;
+  }
+
+  /**
+   * Determines whether the current position is a valid IPv6addrz.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>IPv6addrz = IPv6address "%25" ZoneID</pre>
+   */
+  private boolean ipv6Addrz() {
+    int mark = this.index;
+
+    boolean matched =
+        this.ipv6Address()
+            // "%25" is the percent-encoded "%" that introduces the zone identifier.
+            && this.take('%')
+            && this.take('2')
+            && this.take('5')
+            && this.zoneID();
+
+    if (!matched) {
+      this.index = mark;
+    }
+
+    return matched;
+  }
+
+  /**
+   * Determines whether the current position is a valid zone ID.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>ZoneID = 1*( unreserved / pct-encoded )</pre>
+   */
+  private boolean zoneID() {
+    int mark = this.index;
+
+    while (this.unreserved() || this.pctEncoded()) {
+      // keep consuming zone ID characters
+    }
+
+    // At least one character is required. When none was consumed the position is still at mark,
+    // so there is nothing to restore.
+    return this.index > mark;
+  }
+
+  /**
+   * Determines whether the current position is a valid IPvFuture.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )</pre>
+   */
+  private boolean ipvFuture() {
+    int mark = this.index;
+
+    // "v" followed by at least one hex digit.
+    if (this.take('v') && this.hexDig()) {
+      while (this.hexDig()) {
+        // keep consuming version digits
+      }
+
+      if (this.take('.')) {
+        int tailStart = this.index;
+
+        while (this.unreserved() || this.subDelims() || this.take(':')) {
+          // keep consuming; every iteration advances by exactly one character
+        }
+
+        // At least one character must follow the dot.
+        if (this.index > tailStart) {
+          return true;
+        }
+      }
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid reg-name.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>reg-name = *( unreserved / pct-encoded / sub-delims )</pre>
+   *
+   * <p>Terminates on start of port (":") or end of authority.
+   */
+  private boolean regName() {
+    int mark = this.index;
+
+    while (this.unreserved() || this.pctEncoded() || this.subDelims()) {
+      // keep consuming reg-name characters
+    }
+
+    // The name must be followed by the end of the authority or by the ":" that starts the port.
+    // The ":" is only peeked at, not consumed.
+    if (this.isAuthorityEnd() || this.str.charAt(this.index) == ':') {
+      return true;
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+ /**
+ * Determines whether the current position is the end of the path.
+ *
+ * The path is terminated by one of the following:
+ *
+ *
path-abempty = *( "/" segment )
+ *
+ * Terminated by end of path: "?", "#", or end of URI.
+ */
+  private boolean pathAbempty() {
+    int mark = this.index;
+
+    while (this.take('/') && this.segment()) {
+      // keep consuming "/" segment pairs
+    }
+
+    // Whatever was consumed only counts if the path ends here.
+    if (!this.isPathEnd()) {
+      this.index = mark;
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Determines whether the current position is a valid path-absolute.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>path-absolute = "/" [ segment-nz *( "/" segment ) ]</pre>
+   *
+   * <p>Terminated by end of path: "?", "#", or end of URI.
+   */
+  private boolean pathAbsolute() {
+    int mark = this.index;
+
+    if (!this.take('/')) {
+      this.index = mark;
+      return false;
+    }
+
+    // The part after the leading slash is optional.
+    if (this.segmentNz()) {
+      while (this.take('/') && this.segment()) {
+        // keep consuming "/" segment pairs
+      }
+    }
+
+    if (this.isPathEnd()) {
+      return true;
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid path-noscheme.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>path-noscheme = segment-nz-nc *( "/" segment )</pre>
+   *
+   * <p>Terminated by end of path: "?", "#", or end of URI.
+   */
+  private boolean pathNoscheme() {
+    int mark = this.index;
+
+    if (!this.segmentNzNc()) {
+      this.index = mark;
+      return false;
+    }
+
+    while (this.take('/') && this.segment()) {
+      // keep consuming "/" segment pairs
+    }
+
+    if (this.isPathEnd()) {
+      return true;
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a valid path-rootless.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>path-rootless = segment-nz *( "/" segment )</pre>
+   *
+   * <p>Terminated by end of path: "?", "#", or end of URI.
+   */
+  private boolean pathRootless() {
+    int mark = this.index;
+
+    if (!this.segmentNz()) {
+      this.index = mark;
+      return false;
+    }
+
+    while (this.take('/') && this.segment()) {
+      // keep consuming "/" segment pairs
+    }
+
+    if (this.isPathEnd()) {
+      return true;
+    }
+
+    this.index = mark;
+    return false;
+  }
+
+ /**
+ * Determines whether the current position is a valid path-empty.
+ *
+ * <p>Parses the rule:
+ *
+ * <pre>path-empty = 0&lt;pchar&gt;</pre>
+ *
+ * <p>A path-empty contains no characters, so this method takes nothing itself and only
+ * delegates to {@code isPathEnd()}.
+ *
+ * <p>Terminated by end of path: "?", "#", or end of URI.
+ */
+ private boolean pathEmpty() {
+ return this.isPathEnd();
+ }
+
+  /**
+   * Determines whether the current position is a valid segment.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>segment = *pchar</pre>
+   *
+   * <p>A segment may be empty, so this always returns true after consuming as many pchars as
+   * possible.
+   */
+  private boolean segment() {
+    while (true) {
+      if (!this.pchar()) {
+        return true;
+      }
+    }
+  }
+
+  /**
+   * Determines whether the current position is a valid segment-nz.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>segment-nz = 1*pchar</pre>
+   */
+  private boolean segmentNz() {
+    int mark = this.index;
+
+    // At least one pchar is required.
+    if (!this.pchar()) {
+      this.index = mark;
+      return false;
+    }
+
+    while (this.pchar()) {
+      // keep consuming pchars
+    }
+
+    return true;
+  }
+
+  /**
+   * Determines whether the current position is a valid segment-nz-nc.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>
+   * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+   *               ; non-zero-length segment without any colon ":"
+   * </pre>
+   */
+  private boolean segmentNzNc() {
+    int mark = this.index;
+
+    while (this.unreserved() || this.pctEncoded() || this.subDelims() || this.take('@')) {
+      // keep consuming segment characters
+    }
+
+    // At least one character is required. When none was consumed the position is still at mark,
+    // so there is nothing to restore.
+    return this.index > mark;
+  }
+
+  /**
+   * Determines whether the current position is a valid pchar.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>pchar = unreserved / pct-encoded / sub-delims / ":" / "@"</pre>
+   */
+  private boolean pchar() {
+    if (this.unreserved()) {
+      return true;
+    }
+    if (this.pctEncoded()) {
+      return true;
+    }
+    if (this.subDelims()) {
+      return true;
+    }
+
+    return this.take(':') || this.take('@');
+  }
+
+  /**
+   * Determines whether the current position is a valid query.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>query = *( pchar / "/" / "?" )</pre>
+   *
+   * <p>Terminated by "#" or end of URI.
+   */
+  private boolean query() {
+    int mark = this.index;
+
+    while (this.pchar() || this.take('/') || this.take('?')) {
+      // keep consuming query characters
+    }
+
+    // The "#" that may follow is only peeked at, not consumed.
+    boolean terminated =
+        this.index == this.str.length() || this.str.charAt(this.index) == '#';
+
+    if (!terminated) {
+      this.index = mark;
+    }
+
+    return terminated;
+  }
+
+  /**
+   * Determines whether the current position is a valid fragment.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>fragment = *( pchar / "/" / "?" )</pre>
+   *
+   * <p>Terminated by end of URI.
+   */
+  private boolean fragment() {
+    int mark = this.index;
+
+    while (this.pchar() || this.take('/') || this.take('?')) {
+      // keep consuming fragment characters
+    }
+
+    boolean atEnd = this.index == this.str.length();
+
+    if (!atEnd) {
+      this.index = mark;
+    }
+
+    return atEnd;
+  }
+
+  /**
+   * Determines whether the current position is a valid pct-encoded.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>pct-encoded = "%" HEXDIG HEXDIG</pre>
+   *
+   * <p>Sets {@code pctEncodedFound} to true if a valid triplet was found.
+   */
+  private boolean pctEncoded() {
+    int mark = this.index;
+
+    boolean triplet = this.take('%') && this.hexDig() && this.hexDig();
+
+    if (triplet) {
+      this.pctEncodedFound = true;
+    } else {
+      // A "%" or "%X" prefix may already have been consumed.
+      this.index = mark;
+    }
+
+    return triplet;
+  }
+
+  /**
+   * Determines whether the current position is an unreserved character.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"</pre>
+   */
+  private boolean unreserved() {
+    if (this.alpha() || this.digit()) {
+      return true;
+    }
+
+    return this.take('-') || this.take('_') || this.take('.') || this.take('~');
+  }
+
+  /**
+   * Determines whether the current position is a sub-delim.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>
+   * sub-delims = "!" / "$" / "&amp;" / "'" / "(" / ")"
+   *            / "*" / "+" / "," / ";" / "="
+   * </pre>
+   */
+  private boolean subDelims() {
+    if (this.index >= this.str.length()) {
+      return false;
+    }
+
+    switch (this.str.charAt(this.index)) {
+      case '!':
+      case '$':
+      case '&':
+      case '\'':
+      case '(':
+      case ')':
+      case '*':
+      case '+':
+      case ',':
+      case ';':
+      case '=':
+        this.index++;
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  /**
+   * Determines whether the current position is an alpha character.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>ALPHA = %x41-5A / %x61-7A ; A-Z / a-z</pre>
+   */
+  private boolean alpha() {
+    if (this.index < this.str.length()) {
+      char c = this.str.charAt(this.index);
+      boolean upper = c >= 'A' && c <= 'Z';
+      boolean lower = c >= 'a' && c <= 'z';
+
+      if (upper || lower) {
+        this.index++;
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a hex digit.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"</pre>
+   *
+   * <p>Lowercase "a" to "f" are accepted as well.
+   */
+  private boolean hexDig() {
+    if (this.index < this.str.length()) {
+      char c = this.str.charAt(this.index);
+      boolean decimal = c >= '0' && c <= '9';
+      boolean lowerHex = c >= 'a' && c <= 'f';
+      boolean upperHex = c >= 'A' && c <= 'F';
+
+      if (decimal || lowerHex || upperHex) {
+        this.index++;
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Determines whether the current position is a digit.
+   *
+   * <p>Parses the rule:
+   *
+   * <pre>DIGIT = %x30-39 ; 0-9</pre>
+   */
+  private boolean digit() {
+    if (this.index < this.str.length()) {
+      char c = this.str.charAt(this.index);
+
+      if (c >= '0' && c <= '9') {
+        this.index++;
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Takes the given char at the current position.
+   *
+   * <p>If the character at the current position equals {@code c}, the index is advanced past it.
+   * Otherwise, or at the end of the string, nothing is consumed.
+   *
+   * @param c the character to match
+   * @return {@code true} if the character was present and consumed
+   */
+  private boolean take(char c) {
+    boolean matches = this.index < this.str.length() && this.str.charAt(this.index) == c;
+
+    if (matches) {
+      this.index++;
+    }
+
+    return matches;
+  }
+}