Skip to content

Commit 6b6c38b

Browse files
committed
feat: simplify object path split (#1028)
* simplify object path split * add example from #975 * fix tests * add more test cases * test case update * remove scheme unused regex
1 parent ba95806 commit 6b6c38b

File tree

4 files changed

+79
-73
lines changed

4 files changed

+79
-73
lines changed

src/uproot/_util.py

Lines changed: 14 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import itertools
1313
import numbers
1414
import os
15-
import pathlib
1615
import platform
1716
import re
1817
import warnings
@@ -290,14 +289,10 @@ def regularize_path(path):
290289
_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]")
291290
_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]")
292291

292+
# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs)
293293
_remote_schemes = ["root", "s3", "http", "https"]
294294
_schemes = list({*_remote_schemes, *fsspec.available_protocols()})
295295

296-
_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://")
297-
_uri_scheme_chain = re.compile(
298-
"^(" + "|".join([re.escape(x) for x in _schemes]) + ")::"
299-
)
300-
301296

302297
def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
303298
"""
@@ -313,54 +308,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
313308
"""
314309

315310
urlpath: str = regularize_path(urlpath).strip()
316-
path = urlpath
317-
318-
def _split_path(path: str) -> list[str]:
319-
parts = path.split(":")
320-
if pathlib.PureWindowsPath(path).drive:
321-
# Windows absolute path
322-
assert len(parts) >= 2, f"could not split object from windows path {path}"
323-
parts = [parts[0] + ":" + parts[1]] + parts[2:]
324-
return parts
325-
326-
if "://" not in path:
327-
path = "file://" + path
328-
329-
# replace the match of _uri_scheme_chain with "" until there is no match
330-
while _uri_scheme_chain.match(path):
331-
path = _uri_scheme_chain.sub("", path)
332-
333-
if _uri_scheme.match(path):
334-
# if not a local path, attempt to match a URI scheme
335-
if path.startswith("file://"):
336-
parsed_url_path = path[7:]
337-
else:
338-
parsed_url_path = urlparse(path).path
339-
340-
if parsed_url_path.startswith("//"):
341-
parsed_url_path = parsed_url_path[2:]
342-
343-
parts = _split_path(parsed_url_path)
344-
else:
345-
# invalid scheme
346-
scheme = path.split("://")[0]
347-
raise ValueError(
348-
f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}."
349-
)
350-
351-
if len(parts) == 1:
352-
obj = None
353-
elif len(parts) == 2:
354-
obj = parts[1]
355-
# remove the object from the path (including the colon)
356-
urlpath = urlpath[: -len(obj) - 1]
357-
# clean badly placed slashes
358-
obj = obj.strip().lstrip("/")
359-
while "//" in obj:
360-
obj = obj.replace("//", "/")
361-
else:
362-
raise ValueError(f"could not split object from path {path}")
363-
311+
obj = None
312+
313+
separator = "::"
314+
parts = urlpath.split(separator)
315+
object_regex = re.compile(r"(.+\.root):(.*$)")
316+
for i, part in enumerate(reversed(parts)):
317+
match = object_regex.match(part)
318+
if match:
319+
obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/")
320+
parts[-i - 1] = match.group(1)
321+
break
322+
323+
urlpath = separator.join(parts)
364324
return urlpath, obj
365325

366326

tests/test_0001_source_class.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,13 +148,13 @@ def test_colons_and_ports():
148148
"https://example.com:443",
149149
None,
150150
)
151-
assert uproot._util.file_object_path_split("https://example.com:443/something") == (
152-
"https://example.com:443/something",
151+
assert uproot._util.file_object_path_split("https://example.com:443/file.root") == (
152+
"https://example.com:443/file.root",
153153
None,
154154
)
155155
assert uproot._util.file_object_path_split(
156-
"https://example.com:443/something:else"
157-
) == ("https://example.com:443/something", "else")
156+
"https://example.com:443/file.root:object"
157+
) == ("https://example.com:443/file.root", "object")
158158

159159

160160
@pytest.mark.parametrize("use_threads", [True, False], indirect=True)

tests/test_0692_fsspec_reading.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def test_fsspec_zip(tmp_path):
208208

209209
# open with fsspec
210210
with uproot.open(
211-
f"zip://{filename}::file://{filename_zip}:Events/MET_pt"
211+
f"zip://{filename}:Events/MET_pt::file://{filename_zip}"
212212
) as branch:
213213
data = branch.array(library="np")
214214
assert len(data) == 40

tests/test_0976_path_object_split.py

Lines changed: 60 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -64,24 +64,38 @@
6464
),
6565
),
6666
(
67-
"ssh://user@host:22/path/to/file:object",
67+
"ssh://user@host:22/path/to/file.root:/object//path",
6868
(
69-
"ssh://user@host:22/path/to/file",
70-
"object",
69+
"ssh://user@host:22/path/to/file.root",
70+
"object/path",
7171
),
7272
),
7373
(
74-
"ssh://user@host:50230/path/to/file",
74+
"ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/",
7575
(
76-
"ssh://user@host:50230/path/to/file",
76+
"ssh://user@host:22/path/to/file.root",
77+
"object/path:with:colon:in:path/something",
78+
),
79+
),
80+
(
81+
"ssh://user@host:50230/path/to/file.root",
82+
(
83+
"ssh://user@host:50230/path/to/file.root",
7784
None,
7885
),
7986
),
8087
(
81-
"s3://bucket/path/to/file:object",
88+
"s3://bucket/path/to/file.root:/dir////object",
89+
(
90+
"s3://bucket/path/to/file.root",
91+
"dir/object",
92+
),
93+
),
94+
(
95+
"s3://bucket/path/to/file.root:",
8296
(
83-
"s3://bucket/path/to/file",
84-
"object",
97+
"s3://bucket/path/to/file.root",
98+
"",
8599
),
86100
),
87101
(
@@ -98,27 +112,56 @@
98112
None,
99113
),
100114
),
115+
# https://github.com/scikit-hep/uproot5/issues/975
101116
(
102-
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
117+
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree",
118+
(
119+
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root",
120+
"RNT:CollectionTree",
121+
),
122+
),
123+
(
124+
"zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
103125
(
104126
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
105127
"Events/MET_pt",
106128
),
107129
),
108130
(
109-
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
131+
"simplecache::zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
110132
(
111133
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
112134
"Events/MET_pt",
113135
),
114136
),
115137
(
116-
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip:Events/MET_pt",
138+
r"zip://uproot-issue121.root:Events/MET_pt::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
117139
(
118140
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
119141
"Events/MET_pt",
120142
),
121143
),
144+
(
145+
"zip://uproot-issue121.root:Events/MET_pt::file:///some/weird/path:with:colons/file.root",
146+
(
147+
"zip://uproot-issue121.root::file:///some/weird/path:with:colons/file.root",
148+
"Events/MET_pt",
149+
),
150+
),
151+
(
152+
"/some/weird/path:with:colons/file.root:Events/MET_pt",
153+
(
154+
"/some/weird/path:with:colons/file.root",
155+
"Events/MET_pt",
156+
),
157+
),
158+
(
159+
"/some/weird/path:with:colons/file.root",
160+
(
161+
"/some/weird/path:with:colons/file.root",
162+
None,
163+
),
164+
),
122165
],
123166
)
124167
def test_url_split(input_value, expected_output):
@@ -131,9 +174,12 @@ def test_url_split(input_value, expected_output):
131174
@pytest.mark.parametrize(
132175
"input_value",
133176
[
134-
"local/file.root://Events",
177+
"local/file.root.zip://Events",
178+
"local/file.roo://Events",
179+
"local/file://Events",
135180
],
136181
)
137182
def test_url_split_invalid(input_value):
138-
with pytest.raises(ValueError):
139-
uproot._util.file_object_path_split(input_value)
183+
path, obj = uproot._util.file_object_path_split(input_value)
184+
assert obj is None
185+
assert path == input_value

0 commit comments

Comments
 (0)