Skip to content

Commit 695f14e

Browse files
Fix lazy reference handling for .parquet/.parq files (#1923)
--------- Co-authored-by: Martin Durant <[email protected]>
1 parent e2dd818 commit 695f14e

File tree

3 files changed

+40
-8
lines changed

3 files changed

+40
-8
lines changed

fsspec/implementations/reference.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@
2222
from fsspec.callbacks import DEFAULT_CALLBACK
2323
from fsspec.core import filesystem, open, split_protocol
2424
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
25-
from fsspec.utils import isfilelike, merge_offset_ranges, other_paths
25+
from fsspec.utils import (
26+
isfilelike,
27+
merge_offset_ranges,
28+
other_paths,
29+
)
2630

2731
logger = logging.getLogger("fsspec.reference")
2832

@@ -698,20 +702,22 @@ def __init__(
698702
**(ref_storage_args or target_options or {}), protocol=target_protocol
699703
)
700704
ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
701-
if ref_fs.isfile(fo2):
702-
# text JSON
703-
with fsspec.open(fo, "rb", **dic) as f:
704-
logger.info("Read reference from URL %s", fo)
705-
text = json.load(f)
706-
self._process_references(text, template_overrides)
707-
else:
705+
if ".json" not in fo2 and (
706+
fo.endswith(("parq", "parquet", "/")) or ref_fs.isdir(fo2)
707+
):
708708
# Lazy parquet refs
709709
logger.info("Open lazy reference dict from URL %s", fo)
710710
self.references = LazyReferenceMapper(
711711
fo2,
712712
fs=ref_fs,
713713
cache_size=cache_size,
714714
)
715+
else:
716+
# text JSON
717+
with fsspec.open(fo, "rb", **dic) as f:
718+
logger.info("Read reference from URL %s", fo)
719+
text = json.load(f)
720+
self._process_references(text, template_overrides)
715721
else:
716722
# dictionaries
717723
self._process_references(fo, template_overrides)

fsspec/tests/test_utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from fsspec.utils import (
1010
can_be_local,
1111
common_prefix,
12+
get_file_extension,
1213
get_protocol,
1314
infer_storage_options,
1415
merge_offset_ranges,
@@ -338,6 +339,23 @@ def test_get_protocol(par):
338339
assert get_protocol(url) == outcome
339340

340341

342+
@pytest.mark.parametrize(
343+
["url", "expected"],
344+
(
345+
("https://example.com/q.txt", "txt"),
346+
("https://example.com/foo.parquet", "parquet"),
347+
("https://example.com/foo.parq", "parq"),
348+
("file:///home/user/no_extension", ""),
349+
("/local/path/to/file.json", "json"),
350+
("relative/path/file.yaml", "yaml"),
351+
),
352+
)
353+
def test_get_file_extension(url, expected):
354+
actual = get_file_extension(url)
355+
356+
assert actual == expected
357+
358+
341359
@pytest.mark.parametrize(
342360
"par",
343361
[

fsspec/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,14 @@ def get_protocol(url: str) -> str:
438438
return "file"
439439

440440

441+
def get_file_extension(url: str) -> str:
442+
url = stringify_path(url)
443+
ext_parts = url.rsplit(".", 1)
444+
if len(ext_parts) > 1:
445+
return ext_parts[-1]
446+
return ""
447+
448+
441449
def can_be_local(path: str) -> bool:
442450
"""Can the given URL be used with open_local?"""
443451
from fsspec import get_filesystem_class

0 commit comments

Comments
 (0)