Skip to content

Commit 63315f3

Browse files
authored
feat: simplify object path split (#1028)
* simplify object path split
* add example from #975
* fix tests
* add more test cases
* test case update
* remove unused scheme regex
1 parent 59b6496 commit 63315f3

File tree

4 files changed

+79
-73
lines changed

4 files changed

+79
-73
lines changed

src/uproot/_util.py

Lines changed: 14 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
import itertools
1313
import numbers
1414
import os
15-
import pathlib
1615
import platform
1716
import re
1817
import warnings
@@ -275,14 +274,10 @@ def regularize_path(path):
275274
_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]")
276275
_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]")
277276

277+
# Listed explicitly: fsspec.available_protocols() may omit these schemes when the corresponding libraries (e.g. s3fs) are not installed
278278
_remote_schemes = ["root", "s3", "http", "https"]
279279
_schemes = list({*_remote_schemes, *fsspec.available_protocols()})
280280

281-
_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://")
282-
_uri_scheme_chain = re.compile(
283-
"^(" + "|".join([re.escape(x) for x in _schemes]) + ")::"
284-
)
285-
286281

287282
def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
288283
"""
@@ -298,54 +293,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
298293
"""
299294

300295
urlpath: str = regularize_path(urlpath).strip()
301-
path = urlpath
302-
303-
def _split_path(path: str) -> list[str]:
304-
parts = path.split(":")
305-
if pathlib.PureWindowsPath(path).drive:
306-
# Windows absolute path
307-
assert len(parts) >= 2, f"could not split object from windows path {path}"
308-
parts = [parts[0] + ":" + parts[1]] + parts[2:]
309-
return parts
310-
311-
if "://" not in path:
312-
path = "file://" + path
313-
314-
# replace the match of _uri_scheme_chain with "" until there is no match
315-
while _uri_scheme_chain.match(path):
316-
path = _uri_scheme_chain.sub("", path)
317-
318-
if _uri_scheme.match(path):
319-
# if not a local path, attempt to match a URI scheme
320-
if path.startswith("file://"):
321-
parsed_url_path = path[7:]
322-
else:
323-
parsed_url_path = urlparse(path).path
324-
325-
if parsed_url_path.startswith("//"):
326-
parsed_url_path = parsed_url_path[2:]
327-
328-
parts = _split_path(parsed_url_path)
329-
else:
330-
# invalid scheme
331-
scheme = path.split("://")[0]
332-
raise ValueError(
333-
f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}."
334-
)
335-
336-
if len(parts) == 1:
337-
obj = None
338-
elif len(parts) == 2:
339-
obj = parts[1]
340-
# remove the object from the path (including the colon)
341-
urlpath = urlpath[: -len(obj) - 1]
342-
# clean badly placed slashes
343-
obj = obj.strip().lstrip("/")
344-
while "//" in obj:
345-
obj = obj.replace("//", "/")
346-
else:
347-
raise ValueError(f"could not split object from path {path}")
348-
296+
obj = None
297+
298+
separator = "::"
299+
parts = urlpath.split(separator)
300+
object_regex = re.compile(r"(.+\.root):(.*$)")
301+
for i, part in enumerate(reversed(parts)):
302+
match = object_regex.match(part)
303+
if match:
304+
obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/")
305+
parts[-i - 1] = match.group(1)
306+
break
307+
308+
urlpath = separator.join(parts)
349309
return urlpath, obj
350310

351311

tests/test_0001_source_class.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -148,13 +148,13 @@ def test_colons_and_ports():
148148
"https://example.com:443",
149149
None,
150150
)
151-
assert uproot._util.file_object_path_split("https://example.com:443/something") == (
152-
"https://example.com:443/something",
151+
assert uproot._util.file_object_path_split("https://example.com:443/file.root") == (
152+
"https://example.com:443/file.root",
153153
None,
154154
)
155155
assert uproot._util.file_object_path_split(
156-
"https://example.com:443/something:else"
157-
) == ("https://example.com:443/something", "else")
156+
"https://example.com:443/file.root:object"
157+
) == ("https://example.com:443/file.root", "object")
158158

159159

160160
@pytest.mark.parametrize("use_threads", [True, False], indirect=True)

tests/test_0692_fsspec_reading.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -199,7 +199,7 @@ def test_fsspec_zip(tmp_path):
199199

200200
# open with fsspec
201201
with uproot.open(
202-
f"zip://{filename}::file://{filename_zip}:Events/MET_pt"
202+
f"zip://{filename}:Events/MET_pt::file://{filename_zip}"
203203
) as branch:
204204
data = branch.array(library="np")
205205
assert len(data) == 40

tests/test_0976_path_object_split.py

Lines changed: 60 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -64,24 +64,38 @@
6464
),
6565
),
6666
(
67-
"ssh://user@host:22/path/to/file:object",
67+
"ssh://user@host:22/path/to/file.root:/object//path",
6868
(
69-
"ssh://user@host:22/path/to/file",
70-
"object",
69+
"ssh://user@host:22/path/to/file.root",
70+
"object/path",
7171
),
7272
),
7373
(
74-
"ssh://user@host:50230/path/to/file",
74+
"ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/",
7575
(
76-
"ssh://user@host:50230/path/to/file",
76+
"ssh://user@host:22/path/to/file.root",
77+
"object/path:with:colon:in:path/something",
78+
),
79+
),
80+
(
81+
"ssh://user@host:50230/path/to/file.root",
82+
(
83+
"ssh://user@host:50230/path/to/file.root",
7784
None,
7885
),
7986
),
8087
(
81-
"s3://bucket/path/to/file:object",
88+
"s3://bucket/path/to/file.root:/dir////object",
89+
(
90+
"s3://bucket/path/to/file.root",
91+
"dir/object",
92+
),
93+
),
94+
(
95+
"s3://bucket/path/to/file.root:",
8296
(
83-
"s3://bucket/path/to/file",
84-
"object",
97+
"s3://bucket/path/to/file.root",
98+
"",
8599
),
86100
),
87101
(
@@ -98,27 +112,56 @@
98112
None,
99113
),
100114
),
115+
# https://github.com/scikit-hep/uproot5/issues/975
101116
(
102-
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
117+
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree",
118+
(
119+
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root",
120+
"RNT:CollectionTree",
121+
),
122+
),
123+
(
124+
"zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
103125
(
104126
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
105127
"Events/MET_pt",
106128
),
107129
),
108130
(
109-
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
131+
"simplecache::zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
110132
(
111133
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
112134
"Events/MET_pt",
113135
),
114136
),
115137
(
116-
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip:Events/MET_pt",
138+
r"zip://uproot-issue121.root:Events/MET_pt::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
117139
(
118140
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
119141
"Events/MET_pt",
120142
),
121143
),
144+
(
145+
"zip://uproot-issue121.root:Events/MET_pt::file:///some/weird/path:with:colons/file.root",
146+
(
147+
"zip://uproot-issue121.root::file:///some/weird/path:with:colons/file.root",
148+
"Events/MET_pt",
149+
),
150+
),
151+
(
152+
"/some/weird/path:with:colons/file.root:Events/MET_pt",
153+
(
154+
"/some/weird/path:with:colons/file.root",
155+
"Events/MET_pt",
156+
),
157+
),
158+
(
159+
"/some/weird/path:with:colons/file.root",
160+
(
161+
"/some/weird/path:with:colons/file.root",
162+
None,
163+
),
164+
),
122165
],
123166
)
124167
def test_url_split(input_value, expected_output):
@@ -131,9 +174,12 @@ def test_url_split(input_value, expected_output):
131174
@pytest.mark.parametrize(
132175
"input_value",
133176
[
134-
"local/file.root://Events",
177+
"local/file.root.zip://Events",
178+
"local/file.roo://Events",
179+
"local/file://Events",
135180
],
136181
)
137182
def test_url_split_invalid(input_value):
138-
with pytest.raises(ValueError):
139-
uproot._util.file_object_path_split(input_value)
183+
path, obj = uproot._util.file_object_path_split(input_value)
184+
assert obj is None
185+
assert path == input_value

0 commit comments

Comments
 (0)