Guard against wide-domain check bypass

sigmavirus24 · sigmavirus24 · commit a1a69e1df00a · 2020-04-07T07:59:59.000-05:00
Previously, we parsed the malicious URL example with an authority of 'user:pass@xdavidhu.me\\test.corp.google.com:8080', but we did not split that authority into its components because it contained invalid characters, so accessing the `host` attribute returned `None`. That said, someone might still have used the `authority` attribute and been misled. To avoid misuse by developers, let's parse this similarly to the fix in the blog post, ending the authority at the first backslash. See also: https://bugs.xdavidhu.me/google/2020/03/08/the-unexpected-google-wide-domain-check-bypass/
diff --git a/src/rfc3986/abnf_regexp.py b/src/rfc3986/abnf_regexp.py
@@ -39,7 +39,7 @@
 # than appear in Appendix B for scheme. This will prevent over-eager
 # consuming of items that aren't schemes.
 SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
-_AUTHORITY_RE = '[^/?#]*'
+_AUTHORITY_RE = '[^\\\\/?#]*'
 _PATH_RE = '[^?#]*'
 _QUERY_RE = '[^#]*'
 _FRAGMENT_RE = '.*'
diff --git a/tests/test_uri.py b/tests/test_uri.py
@@ -351,3 +351,15 @@ def test_empty_querystrings_persist():
     ref = URIReference.from_string(url)
     assert ref.query == ''
     assert ref.unsplit() == url
+
+
+def test_wide_domain_bypass_check():
+    """Verify we properly parse/handle the authority.
+
+    See also:
+    https://bugs.xdavidhu.me/google/2020/03/08/the-unexpected-google-wide-domain-check-bypass/
+    """
+    url = "https://user:pass@xdavidhu.me\\test.corp.google.com:8080/path/to/something?param=value#hash"
+    ref = URIReference.from_string(url)
+    assert ref.scheme == "https"
+    assert ref.host == "xdavidhu.me"