Skip to content

Commit abebba9

Browse files
feat: Add consistent URL representation for all storage paths (#1326)
Implements unified URL handling for all storage backends including local files: - Add URL_PROTOCOLS tuple including file:// - Add is_url() to check if path is a URL - Add normalize_to_url() to convert local paths to file:// URLs - Add parse_url() to parse any URL into protocol and path - Add StorageBackend.get_url() to return full URLs for any backend - Add comprehensive unit tests for URL functions This enables consistent internal representation across all storage types, aligning with fsspec's unified approach to filesystems. Closes #1326 Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 6c29792 commit abebba9

File tree

3 files changed

+231
-45
lines changed

3 files changed

+231
-45
lines changed

src/datajoint/objectref.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -128,32 +128,6 @@ def to_json(self) -> dict:
128128
data["item_count"] = self.item_count
129129
return data
130130

131-
def to_dict(self) -> dict:
132-
"""
133-
Return the raw JSON metadata as a dictionary.
134-
135-
This is useful for inspecting the stored metadata without triggering
136-
any storage backend operations. The returned dict matches the JSON
137-
structure stored in the database.
138-
139-
Returns
140-
-------
141-
dict
142-
Dict containing the object metadata:
143-
144-
- path: Relative storage path within the store
145-
- url: Full URI (e.g., 's3://bucket/path') (optional)
146-
- store: Store name (optional, None for default store)
147-
- size: File/folder size in bytes (or None)
148-
- hash: Content hash (or None)
149-
- ext: File extension (or None)
150-
- is_dir: True if folder
151-
- timestamp: Upload timestamp
152-
- mime_type: MIME type (files only, optional)
153-
- item_count: Number of files (folders only, optional)
154-
"""
155-
return self.to_json()
156-
157131
def _ensure_backend(self):
158132
"""Ensure storage backend is available for I/O operations."""
159133
if self._backend is None:

src/datajoint/storage.py

Lines changed: 110 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@
2424
# Characters safe for use in filenames and URLs
2525
TOKEN_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
2626

27-
# Supported remote URL protocols for copy insert
28-
REMOTE_PROTOCOLS = ("s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://")
27+
# Supported URL protocols
28+
URL_PROTOCOLS = ("file://", "s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://")
2929

3030

31-
def is_remote_url(path: str) -> bool:
31+
def is_url(path: str) -> bool:
3232
"""
33-
Check if a path is a remote URL.
33+
Check if a path is a URL.
3434
3535
Parameters
3636
----------
@@ -40,21 +40,57 @@ def is_remote_url(path: str) -> bool:
4040
Returns
4141
-------
4242
bool
43-
True if path starts with a supported remote protocol.
43+
True if path starts with a supported URL protocol.
4444
"""
45-
if not isinstance(path, str):
46-
return False
47-
return path.lower().startswith(REMOTE_PROTOCOLS)
45+
return path.lower().startswith(URL_PROTOCOLS)
4846

4947

50-
def parse_remote_url(url: str) -> tuple[str, str]:
48+
def normalize_to_url(path: str) -> str:
5149
"""
52-
Parse a remote URL into protocol and path.
50+
Normalize a path to URL form.
51+
52+
Converts local filesystem paths to file:// URLs. URLs are returned unchanged.
53+
54+
Parameters
55+
----------
56+
path : str
57+
Path string (local path or URL).
58+
59+
Returns
60+
-------
61+
str
62+
URL form of the path.
63+
64+
Examples
65+
--------
66+
>>> normalize_to_url("/data/file.dat")
67+
'file:///data/file.dat'
68+
>>> normalize_to_url("s3://bucket/key")
69+
's3://bucket/key'
70+
>>> normalize_to_url("file:///already/url")
71+
'file:///already/url'
72+
"""
73+
if is_url(path):
74+
return path
75+
# Convert local path to file:// URL
76+
# Ensure absolute path and proper format
77+
abs_path = str(Path(path).resolve())
78+
# Handle Windows paths (C:\...) vs Unix paths (/...)
79+
if abs_path.startswith("/"):
80+
return f"file://{abs_path}"
81+
else:
82+
# Windows: file:///C:/path
83+
return f"file:///{abs_path.replace(chr(92), '/')}"
84+
85+
86+
def parse_url(url: str) -> tuple[str, str]:
87+
"""
88+
Parse a URL into protocol and path.
5389
5490
Parameters
5591
----------
5692
url : str
57-
Remote URL (e.g., ``'s3://bucket/path/file.dat'``).
93+
URL (e.g., ``'s3://bucket/path/file.dat'`` or ``'file:///path/to/file'``).
5894
5995
Returns
6096
-------
@@ -65,11 +101,19 @@ def parse_remote_url(url: str) -> tuple[str, str]:
65101
------
66102
DataJointError
67103
If URL protocol is not supported.
104+
105+
Examples
106+
--------
107+
>>> parse_url("s3://bucket/key/file.dat")
108+
('s3', 'bucket/key/file.dat')
109+
>>> parse_url("file:///data/file.dat")
110+
('file', '/data/file.dat')
68111
"""
69112
url_lower = url.lower()
70113

71114
# Map URL schemes to fsspec protocols
72115
protocol_map = {
116+
"file://": "file",
73117
"s3://": "s3",
74118
"gs://": "gcs",
75119
"gcs://": "gcs",
@@ -84,7 +128,7 @@ def parse_remote_url(url: str) -> tuple[str, str]:
84128
path = url[len(prefix) :]
85129
return protocol, path
86130

87-
raise errors.DataJointError(f"Unsupported remote URL protocol: {url}")
131+
raise errors.DataJointError(f"Unsupported URL protocol: {url}")
88132

89133

90134
def generate_token(length: int = 8) -> str:
@@ -358,6 +402,53 @@ def _full_path(self, path: str | PurePosixPath) -> str:
358402
return str(Path(location) / path)
359403
return path
360404

405+
def get_url(self, path: str | PurePosixPath) -> str:
    """
    Get the full URL for a path in storage.

    Returns a consistent URL representation for any storage backend,
    including ``file://`` URLs for the local filesystem.

    Parameters
    ----------
    path : str or PurePosixPath
        Relative path within the storage location.

    Returns
    -------
    str
        Full URL (e.g., ``'s3://bucket/path'`` or ``'file:///data/path'``).

    Examples
    --------
    >>> backend = StorageBackend({"protocol": "file", "location": "/data"})
    >>> backend.get_url("schema/table/file.dat")
    'file:///data/schema/table/file.dat'

    >>> backend = StorageBackend({"protocol": "s3", "bucket": "mybucket", ...})
    >>> backend.get_url("schema/table/file.dat")
    's3://mybucket/schema/table/file.dat'
    """
    full_path = self._full_path(path)

    if self.protocol == "file":
        # Resolve to an absolute path so the file:// URL is unambiguous.
        abs_path = str(Path(full_path).resolve())
        if abs_path.startswith("/"):
            return f"file://{abs_path}"
        # Windows drive path: forward slashes, extra slash before the drive
        # letter (file:///C:/path). replace() is hoisted out of the f-string
        # because backslashes are not allowed in f-string expressions
        # before Python 3.12.
        forward_path = abs_path.replace("\\", "/")
        return f"file:///{forward_path}"

    # Map internal fsspec protocol names to canonical URL schemes;
    # unknown protocols fall back to using the protocol name as scheme.
    scheme = {"s3": "s3", "gcs": "gs", "azure": "az"}.get(self.protocol, self.protocol)
    return f"{scheme}://{full_path}"
451+
361452
def put_file(self, local_path: str | Path, remote_path: str | PurePosixPath, metadata: dict | None = None) -> None:
362453
"""
363454
Upload a file from local filesystem to storage.
@@ -674,7 +765,7 @@ def copy_from_url(self, source_url: str, dest_path: str | PurePosixPath) -> int:
674765
int
675766
Size of copied file in bytes.
676767
"""
677-
protocol, source_path = parse_remote_url(source_url)
768+
protocol, source_path = parse_url(source_url)
678769
full_dest = self._full_path(dest_path)
679770

680771
logger.debug(f"copy_from_url: {protocol}://{source_path} -> {self.protocol}:{full_dest}")
@@ -774,8 +865,8 @@ def source_is_directory(self, source: str) -> bool:
774865
bool
775866
True if source is a directory.
776867
"""
777-
if is_remote_url(source):
778-
protocol, path = parse_remote_url(source)
868+
if is_url(source):
869+
protocol, path = parse_url(source)
779870
source_fs = fsspec.filesystem(protocol)
780871
return source_fs.isdir(path)
781872
else:
@@ -795,8 +886,8 @@ def source_exists(self, source: str) -> bool:
795886
bool
796887
True if source exists.
797888
"""
798-
if is_remote_url(source):
799-
protocol, path = parse_remote_url(source)
889+
if is_url(source):
890+
protocol, path = parse_url(source)
800891
source_fs = fsspec.filesystem(protocol)
801892
return source_fs.exists(path)
802893
else:
@@ -817,8 +908,8 @@ def get_source_size(self, source: str) -> int | None:
817908
Size in bytes, or None if directory or cannot determine.
818909
"""
819910
try:
820-
if is_remote_url(source):
821-
protocol, path = parse_remote_url(source)
911+
if is_url(source):
912+
protocol, path = parse_url(source)
822913
source_fs = fsspec.filesystem(protocol)
823914
if source_fs.isdir(path):
824915
return None

tests/unit/test_storage_urls.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""Unit tests for storage URL functions."""
2+
3+
import pytest
4+
5+
from datajoint.storage import (
6+
URL_PROTOCOLS,
7+
is_url,
8+
normalize_to_url,
9+
parse_url,
10+
)
11+
12+
13+
class TestURLProtocols:
    """Checks on the URL_PROTOCOLS constant."""

    def test_url_protocols_includes_file(self):
        """The local-filesystem scheme must be registered."""
        assert any(scheme == "file://" for scheme in URL_PROTOCOLS)

    def test_url_protocols_includes_s3(self):
        """The s3:// scheme must be registered."""
        assert any(scheme == "s3://" for scheme in URL_PROTOCOLS)

    def test_url_protocols_includes_cloud_providers(self):
        """Major cloud-provider schemes must be registered."""
        expected = {"gs://", "az://"}
        assert expected.issubset(set(URL_PROTOCOLS))
29+
30+
class TestIsUrl:
    """Checks on the is_url predicate."""

    def test_s3_url(self):
        candidate = "s3://bucket/key"
        assert is_url(candidate)

    def test_gs_url(self):
        candidate = "gs://bucket/key"
        assert is_url(candidate)

    def test_file_url(self):
        candidate = "file:///path/to/file"
        assert is_url(candidate)

    def test_http_url(self):
        candidate = "http://example.com/file"
        assert is_url(candidate)

    def test_https_url(self):
        candidate = "https://example.com/file"
        assert is_url(candidate)

    def test_local_path_not_url(self):
        candidate = "/path/to/file"
        assert not is_url(candidate)

    def test_relative_path_not_url(self):
        candidate = "relative/path/file.dat"
        assert not is_url(candidate)

    def test_case_insensitive(self):
        # Scheme matching should ignore letter case.
        for candidate in ("S3://bucket/key", "FILE:///path"):
            assert is_url(candidate)
58+
59+
class TestNormalizeToUrl:
    """Checks on the normalize_to_url converter."""

    def test_local_path_to_file_url(self):
        result = normalize_to_url("/data/file.dat")
        assert result.startswith("file://")
        assert "data/file.dat" in result

    def test_s3_url_unchanged(self):
        original = "s3://bucket/key/file.dat"
        assert normalize_to_url(original) == original

    def test_file_url_unchanged(self):
        original = "file:///data/file.dat"
        assert normalize_to_url(original) == original

    def test_relative_path_becomes_absolute(self):
        result = normalize_to_url("relative/path.dat")
        assert result.startswith("file://")
        # The part after the scheme must contain a separator, i.e. the
        # relative input was resolved to an absolute path.
        remainder = result[len("file://"):]
        assert "/" in remainder
80+
81+
82+
class TestParseUrl:
    """Checks on the parse_url splitter."""

    def test_parse_s3(self):
        assert parse_url("s3://bucket/key/file.dat") == ("s3", "bucket/key/file.dat")

    def test_parse_gs(self):
        # gs:// maps onto the fsspec "gcs" protocol.
        assert parse_url("gs://bucket/key") == ("gcs", "bucket/key")

    def test_parse_gcs(self):
        assert parse_url("gcs://bucket/key") == ("gcs", "bucket/key")

    def test_parse_file(self):
        assert parse_url("file:///data/file.dat") == ("file", "/data/file.dat")

    def test_parse_http(self):
        assert parse_url("http://example.com/file") == ("http", "example.com/file")

    def test_parse_https(self):
        assert parse_url("https://example.com/file") == ("https", "example.com/file")

    def test_unsupported_protocol_raises(self):
        with pytest.raises(Exception, match="Unsupported URL protocol"):
            parse_url("ftp://example.com/file")

    def test_local_path_raises(self):
        with pytest.raises(Exception, match="Unsupported URL protocol"):
            parse_url("/local/path")

0 commit comments

Comments
 (0)