# Characters safe for use in filenames and URLs
TOKEN_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"

# Supported URL protocols. Prefixes are lower-case; callers lower-case the
# candidate string before matching. ``file://`` lets local filesystem paths
# be expressed in the same URL form as remote ones.
URL_PROTOCOLS = (
    "file://",
    "s3://",
    "gs://",
    "gcs://",
    "az://",
    "abfs://",
    "http://",
    "https://",
)
2929
3030
def is_url(path: str) -> bool:
    """
    Check if a path is a URL.

    The comparison is case-insensitive: the path is lower-cased before it is
    matched against the prefixes in ``URL_PROTOCOLS``.

    Parameters
    ----------
    path : str
        Path string to check.

    Returns
    -------
    bool
        True if path starts with a supported URL protocol.
    """
    # str.startswith accepts a tuple, so all prefixes are tested in one call.
    return path.lower().startswith(URL_PROTOCOLS)
4646
4747
def normalize_to_url(path: str) -> str:
    """
    Normalize a path to URL form.

    Converts local filesystem paths to file:// URLs. Strings that ``is_url``
    already recognizes as URLs are returned unchanged.

    Parameters
    ----------
    path : str
        Path string (local path or URL).

    Returns
    -------
    str
        URL form of the path.

    Examples
    --------
    >>> normalize_to_url("/data/file.dat")
    'file:///data/file.dat'
    >>> normalize_to_url("s3://bucket/key")
    's3://bucket/key'
    >>> normalize_to_url("file:///already/url")
    'file:///already/url'
    """
    if is_url(path):
        return path

    # resolve() makes the path absolute (relative paths are anchored at the
    # current working directory) so the resulting file:// URL is unambiguous.
    abs_path = str(Path(path).resolve())

    if abs_path.startswith("/"):
        # Unix-style path: the leading slash supplies the third slash.
        return f"file://{abs_path}"

    # Windows-style path (C:\...): use forward slashes -> file:///C:/path
    return "file:///" + abs_path.replace("\\", "/")
84+
85+
86+ def parse_url (url : str ) -> tuple [str , str ]:
87+ """
88+ Parse a URL into protocol and path.
5189
5290 Parameters
5391 ----------
5492 url : str
55- Remote URL (e.g., ``'s3://bucket/path/file.dat'``).
93+ URL (e.g., ``'s3://bucket/path/file.dat'`` or ``'file:///path/to/file '``).
5694
5795 Returns
5896 -------
@@ -63,11 +101,19 @@ def parse_remote_url(url: str) -> tuple[str, str]:
63101 ------
64102 DataJointError
65103 If URL protocol is not supported.
104+
105+ Examples
106+ --------
107+ >>> parse_url("s3://bucket/key/file.dat")
108+ ('s3', 'bucket/key/file.dat')
109+ >>> parse_url("file:///data/file.dat")
110+ ('file', '/data/file.dat')
66111 """
67112 url_lower = url .lower ()
68113
69114 # Map URL schemes to fsspec protocols
70115 protocol_map = {
116+ "file://" : "file" ,
71117 "s3://" : "s3" ,
72118 "gs://" : "gcs" ,
73119 "gcs://" : "gcs" ,
@@ -82,7 +128,7 @@ def parse_remote_url(url: str) -> tuple[str, str]:
82128 path = url [len (prefix ) :]
83129 return protocol , path
84130
85- raise errors .DataJointError (f"Unsupported remote URL protocol: { url } " )
131+ raise errors .DataJointError (f"Unsupported URL protocol: { url } " )
86132
87133
88134def generate_token (length : int = 8 ) -> str :
@@ -356,6 +402,53 @@ def _full_path(self, path: str | PurePosixPath) -> str:
356402 return str (Path (location ) / path )
357403 return path
358404
405+ def get_url (self , path : str | PurePosixPath ) -> str :
406+ """
407+ Get the full URL for a path in storage.
408+
409+ Returns a consistent URL representation for any storage backend,
410+ including file:// URLs for local filesystem.
411+
412+ Parameters
413+ ----------
414+ path : str or PurePosixPath
415+ Relative path within the storage location.
416+
417+ Returns
418+ -------
419+ str
420+ Full URL (e.g., 's3://bucket/path' or 'file:///data/path').
421+
422+ Examples
423+ --------
424+ >>> backend = StorageBackend({"protocol": "file", "location": "/data"})
425+ >>> backend.get_url("schema/table/file.dat")
426+ 'file:///data/schema/table/file.dat'
427+
428+ >>> backend = StorageBackend({"protocol": "s3", "bucket": "mybucket", ...})
429+ >>> backend.get_url("schema/table/file.dat")
430+ 's3://mybucket/schema/table/file.dat'
431+ """
432+ full_path = self ._full_path (path )
433+
434+ if self .protocol == "file" :
435+ # Ensure absolute path for file:// URL
436+ abs_path = str (Path (full_path ).resolve ())
437+ if abs_path .startswith ("/" ):
438+ return f"file://{ abs_path } "
439+ else :
440+ # Windows path
441+ return f"file:///{ abs_path .replace (chr (92 ), '/' )} "
442+ elif self .protocol == "s3" :
443+ return f"s3://{ full_path } "
444+ elif self .protocol == "gcs" :
445+ return f"gs://{ full_path } "
446+ elif self .protocol == "azure" :
447+ return f"az://{ full_path } "
448+ else :
449+ # Fallback: use protocol prefix
450+ return f"{ self .protocol } ://{ full_path } "
451+
359452 def put_file (self , local_path : str | Path , remote_path : str | PurePosixPath , metadata : dict | None = None ) -> None :
360453 """
361454 Upload a file from local filesystem to storage.
@@ -672,7 +765,7 @@ def copy_from_url(self, source_url: str, dest_path: str | PurePosixPath) -> int:
672765 int
673766 Size of copied file in bytes.
674767 """
675- protocol , source_path = parse_remote_url (source_url )
768+ protocol , source_path = parse_url (source_url )
676769 full_dest = self ._full_path (dest_path )
677770
678771 logger .debug (f"copy_from_url: { protocol } ://{ source_path } -> { self .protocol } :{ full_dest } " )
@@ -772,8 +865,8 @@ def source_is_directory(self, source: str) -> bool:
772865 bool
773866 True if source is a directory.
774867 """
775- if is_remote_url (source ):
776- protocol , path = parse_remote_url (source )
868+ if is_url (source ):
869+ protocol , path = parse_url (source )
777870 source_fs = fsspec .filesystem (protocol )
778871 return source_fs .isdir (path )
779872 else :
@@ -793,8 +886,8 @@ def source_exists(self, source: str) -> bool:
793886 bool
794887 True if source exists.
795888 """
796- if is_remote_url (source ):
797- protocol , path = parse_remote_url (source )
889+ if is_url (source ):
890+ protocol , path = parse_url (source )
798891 source_fs = fsspec .filesystem (protocol )
799892 return source_fs .exists (path )
800893 else :
@@ -815,8 +908,8 @@ def get_source_size(self, source: str) -> int | None:
815908 Size in bytes, or None if directory or cannot determine.
816909 """
817910 try :
818- if is_remote_url (source ):
819- protocol , path = parse_remote_url (source )
911+ if is_url (source ):
912+ protocol , path = parse_url (source )
820913 source_fs = fsspec .filesystem (protocol )
821914 if source_fs .isdir (path ):
822915 return None
0 commit comments