Redesign URL parsing to handle values with a leading "/" or trailing special characters ("#", "?")

irene-sheen-reef · web-flow · commit 5d5b5d318a07 · 2025-07-30T18:29:04.000+05:00
* Replace urlsplit with a new parser

* Replace class SplitB2Result with a namedtuple

* Remove uriparse.b2_urlsplit and simplify the code

* Handle path in parse_uri instead of _parse_b2_uri

* Move URI cleaning into _clean_uri
diff --git a/b2/_internal/_utils/uri.py b/b2/_internal/_utils/uri.py
@@ -10,8 +10,7 @@
 from __future__ import annotations
 
 import dataclasses
-import pathlib
-import urllib.parse
+import re
 from functools import singledispatchmethod
 from pathlib import Path
 from typing import Sequence
@@ -24,7 +23,10 @@
 )
 from b2sdk.v3.exception import B2Error
 
-from b2._internal._utils.python_compat import removeprefix
+_B2ID_PATTERN = re.compile(r'^b2id://(?P<file_id>[a-zA-Z0-9:_-]+)$', re.IGNORECASE)
+_B2_PATTERN = re.compile(r'^b2://(?P<bucket>[a-z0-9-]*)(?P<path>/.*)?$', re.IGNORECASE)
+_SCHEME_PATTERN = re.compile(r'(?P<scheme>[a-z0-9]*)://.*', re.IGNORECASE)
+_CONTROL_CHARACTERS_AND_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
 
 
 class B2URIBase:
@@ -92,10 +94,10 @@ def parse_uri(uri: str, *, allow_all_buckets: bool = False) -> Path | B2URI | B2
     """
     if not uri:
         raise ValueError('URI cannot be empty')
-    parsed = urllib.parse.urlsplit(uri)
-    if parsed.scheme == '':
-        return pathlib.Path(uri)
-    return _parse_b2_uri(uri, parsed, allow_all_buckets=allow_all_buckets)
+
+    if _SCHEME_PATTERN.fullmatch(_clean_uri(uri)):
+        return _parse_b2_uri(uri, allow_all_buckets=allow_all_buckets)
+    return Path(uri)
 
 
 def parse_b2_uri(
@@ -110,38 +112,48 @@ def parse_b2_uri(
     :return: B2 URI
     :raises ValueError: if the URI is invalid
     """
-    parsed = urllib.parse.urlsplit(uri)
-    return _parse_b2_uri(uri, parsed, allow_all_buckets=allow_all_buckets, allow_b2id=allow_b2id)
+    return _parse_b2_uri(uri, allow_all_buckets=allow_all_buckets, allow_b2id=allow_b2id)
+
+
+def _clean_uri(uri: str) -> str:
+    # Clean URI
+    uri = uri.lstrip(_CONTROL_CHARACTERS_AND_SPACE)
+    for i in ('\n', '\r', '\t'):
+        uri = uri.replace(i, '')
+    return uri
 
 
 def _parse_b2_uri(
     uri,
-    parsed: urllib.parse.SplitResult,
     *,
     allow_all_buckets: bool = False,
     allow_b2id: bool = True,
 ) -> B2URI | B2FileIdURI:
-    if parsed.scheme in ('b2', 'b2id'):
-        path = urllib.parse.urlunsplit(parsed._replace(scheme='', netloc=''))
-        if not parsed.netloc:
+    uri = _clean_uri(uri)
+    if uri.lower().startswith('b2://'):
+        match = _B2_PATTERN.fullmatch(uri)
+        if not match:
+            raise ValueError(f'Invalid B2 URI: {uri!r}')
+
+        bucket = match.group('bucket')
+        path = match.group('path')
+        if not bucket:
             if allow_all_buckets:
                 if path:
                     raise ValueError(
                         f"Invalid B2 URI: all buckets URI doesn't allow non-empty path, but {path!r} was provided"
                     )
                 return B2URI(bucket_name='')
-            raise ValueError(f'Invalid B2 URI: {uri!r}')
-        elif parsed.password or parsed.username:
-            raise ValueError(
-                'Invalid B2 URI: credentials passed using `user@password:` syntax is not supported in URI'
-            )
-
-        if parsed.scheme == 'b2':
-            return B2URI(bucket_name=parsed.netloc, path=removeprefix(path, '/'))
-        elif parsed.scheme == 'b2id' and allow_b2id:
-            return B2FileIdURI(file_id=parsed.netloc)
-    else:
-        raise ValueError(f'Unsupported URI scheme: {parsed.scheme!r}')
+        else:
+            return B2URI(bucket_name=bucket, path=path[1:] if path else '')
+    elif allow_b2id and uri.lower().startswith('b2id://'):
+        match = _B2ID_PATTERN.fullmatch(uri)
+        if match:
+            return B2FileIdURI(file_id=match.group('file_id'))
+    elif match := _SCHEME_PATTERN.fullmatch(uri):
+        raise ValueError(f'Unsupported URI scheme: {match.group("scheme")!r}')
+
+    raise ValueError(f'Invalid B2 URI: {uri!r}')
 
 
 class B2URIAdapter:
diff --git a/changelog.d/1090.fixed.md b/changelog.d/1090.fixed.md
@@ -0,0 +1 @@
+Handle filenames starting with / or ending with # or ?.
diff --git a/test/unit/_utils/test_uri.py b/test/unit/_utils/test_uri.py
@@ -61,13 +61,18 @@ def test_b2fileuri_str():
     [
         ('some/local/path', Path('some/local/path')),
         ('./some/local/path', Path('some/local/path')),
+        ('.', Path('')),
         ('b2://bucket', B2URI(bucket_name='bucket')),
+        (' b2://bucket', B2URI(bucket_name='bucket')),
         ('b2://bucket/', B2URI(bucket_name='bucket')),
         ('b2://bucket/path/to/dir/', B2URI(bucket_name='bucket', path='path/to/dir/')),
         ('b2id://file123', B2FileIdURI(file_id='file123')),
         ('b2://bucket/wild[card]', B2URI(bucket_name='bucket', path='wild[card]')),
         ('b2://bucket/wild?card', B2URI(bucket_name='bucket', path='wild?card')),
         ('b2://bucket/special#char', B2URI(bucket_name='bucket', path='special#char')),
+        ('b2://bucket/special#', B2URI(bucket_name='bucket', path='special#')),
+        ('b2://bucket/special?', B2URI(bucket_name='bucket', path='special?')),
+        ('b2://bucket//special', B2URI(bucket_name='bucket', path='/special')),
     ],
 )
 def test_parse_uri(uri, expected):
@@ -94,14 +99,15 @@ def test_parse_uri__allow_all_buckets():
         # Test cases for B2 URIs with credentials
         (
             'b2://user@password:bucket/path',
-            'Invalid B2 URI: credentials passed using `user@password:` syntax is not supported in URI',
+            "Invalid B2 URI: 'b2://user@password:bucket/path'",
         ),
         (
             'b2id://user@password:file123',
-            'Invalid B2 URI: credentials passed using `user@password:` syntax is not supported in URI',
+            "Invalid B2 URI: 'b2id://user@password:file123'",
         ),
         # Test cases for unsupported URI schemes
         ('unknown://bucket/path', "Unsupported URI scheme: 'unknown'"),
+        (' unknown://bucket/path', "Unsupported URI scheme: 'unknown'"),
     ],
 )
 def test_parse_uri_exceptions(uri, expected_exception_message):
diff --git a/test/unit/console_tool/test_download_file.py b/test/unit/console_tool/test_download_file.py
@@ -187,7 +187,7 @@ def test_cat__b2_uri__invalid(b2_cli, capfd):
         expected_stderr=None,
         expected_status=2,
     )
-    assert "argument B2_URI: Unsupported URI scheme: ''" in capfd.readouterr().err
+    assert "argument B2_URI: Invalid B2 URI: 'nothing/meaningful'" in capfd.readouterr().err
 
 
 def test_cat__b2_uri__not_a_file(b2_cli, bucket, capfd):

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Handle filenames starting with / or ending with # or ?.`
Original file line number	Diff line number	Diff line change
`@@ -187,7 +187,7 @@ def test_cat__b2_uri__invalid(b2_cli, capfd):`
`187`	`187`	`expected_stderr=None,`
`188`	`188`	`expected_status=2,`
`189`	`189`	`)`
`190`		`- assert "argument B2_URI: Unsupported URI scheme: ''" in capfd.readouterr().err`
	`190`	`+ assert "argument B2_URI: Invalid B2 URI: 'nothing/meaningful'" in capfd.readouterr().err`
`191`	`191`
`192`	`192`
`193`	`193`	`def test_cat__b2_uri__not_a_file(b2_cli, bucket, capfd):`