Skip to content

Commit 0543fc6

Browse files
committed
fixes to parquet and the known-parts cache
1 parent 2576617 commit 0543fc6

File tree

4 files changed

+75
-76
lines changed

4 files changed

+75
-76
lines changed

fsspec/caching.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,6 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
668668
if stop is None:
669669
stop = self.size
670670
self.total_requested_bytes += stop - start
671-
672671
out = b""
673672
started = False
674673
loc_old = 0
@@ -699,11 +698,13 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
699698
elif loc0 <= stop <= loc1:
700699
# end block
701700
self.hit_count += 1
702-
return out + self.data[(loc0, loc1)][: stop - loc0]
701+
out = out + self.data[(loc0, loc1)][: stop - loc0]
702+
return out
703703
loc_old = loc1
704704
self.miss_count += 1
705705
if started and not self.strict:
706-
return out + b"\x00" * (stop - loc_old)
706+
out = out + b"\x00" * (stop - loc_old)
707+
return out
707708
raise ValueError
708709

709710

fsspec/parquet.py

Lines changed: 41 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import io
22
import json
33
import warnings
4-
from typing import Literal
54

65
import fsspec
76

@@ -25,7 +24,6 @@ def _fetch_range(self, start, end):
2524

2625
def open_parquet_files(
2726
path: list[str],
28-
mode: Literal["rb"] = "rb",
2927
fs: None | fsspec.AbstractFileSystem = None,
3028
metadata=None,
3129
columns: None | list[str] = None,
@@ -54,8 +52,6 @@ def open_parquet_files(
5452
----------
5553
path: str
5654
Target file path.
57-
mode: str, optional
58-
Mode option to be passed through to `fs.open`. Default is "rb".
5955
metadata: Any, optional
6056
Parquet metadata object. Object type must be supported
6157
by the backend parquet engine. For now, only the "fastparquet"
@@ -150,16 +146,16 @@ def open_parquet_files(
150146
AlreadyBufferedFile(
151147
fs=None,
152148
path=fn,
153-
mode=mode,
149+
mode="rb",
154150
cache_type="parts",
155151
cache_options={
156152
**options,
157-
"data": data.get(fn, {}),
153+
"data": ranges,
158154
},
159-
size=max(_[1] for _ in data.get(fn, {})),
155+
size=max(_[1] for _ in ranges),
160156
**kwargs,
161157
)
162-
for fn in data
158+
for fn, ranges in data.items()
163159
]
164160

165161

@@ -197,7 +193,7 @@ def _get_parquet_byte_ranges(
197193
if isinstance(engine, str):
198194
engine = _set_engine(engine)
199195

200-
# Pass to specialized function if metadata is defined
196+
# Pass to a specialized function if metadata is defined
201197
if metadata is not None:
202198
# Use the provided parquet metadata object
203199
# to avoid transferring/parsing footer metadata
@@ -212,63 +208,54 @@ def _get_parquet_byte_ranges(
212208
filters=filters,
213209
)
214210

215-
# Get file sizes asynchronously
216-
file_sizes = fs.sizes(paths)
217-
218211
# Populate global paths, starts, & ends
219-
result = {}
220-
data_paths = []
221-
data_starts = []
222-
data_ends = []
223-
add_header_magic = True
224212
if columns is None and row_groups is None and filters is None:
225213
# We are NOT selecting specific columns or row-groups.
226214
#
227215
# We can avoid sampling the footers, and just transfer
228216
# all file data with cat_ranges
229-
for i, path in enumerate(paths):
230-
result[path] = {}
231-
data_paths.append(path)
232-
data_starts.append(0)
233-
data_ends.append(file_sizes[i])
234-
add_header_magic = False # "Magic" should already be included
217+
result = {path: {(0, len(data)): data} for path, data in fs.cat(paths).items()}
235218
else:
236219
# We ARE selecting specific columns or row-groups.
237220
#
221+
# Get file sizes asynchronously
222+
file_sizes = fs.sizes(paths)
223+
data_paths = []
224+
data_starts = []
225+
data_ends = []
238226
# Gather file footers.
239227
# We just take the last `footer_sample_size` bytes of each
240228
# file (or the entire file if it is smaller than that)
241-
footer_starts = []
242-
footer_ends = []
243-
for i, path in enumerate(paths):
244-
footer_ends.append(file_sizes[i])
245-
sample_size = max(0, file_sizes[i] - footer_sample_size)
246-
footer_starts.append(sample_size)
247-
footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)
229+
footer_starts = [
230+
max(0, file_size - footer_sample_size) for file_size in file_sizes
231+
]
232+
footer_samples = fs.cat_ranges(paths, footer_starts, file_sizes)
248233

249234
# Check our footer samples and re-sample if necessary.
250-
missing_footer_starts = footer_starts.copy()
251-
large_footer = 0
235+
large_footer = []
252236
for i, path in enumerate(paths):
253237
footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
254238
real_footer_start = file_sizes[i] - (footer_size + 8)
255239
if real_footer_start < footer_starts[i]:
256-
missing_footer_starts[i] = real_footer_start
257-
large_footer = max(large_footer, (footer_size + 8))
240+
large_footer.append((i, real_footer_start))
258241
if large_footer:
259242
warnings.warn(
260243
f"Not enough data was used to sample the parquet footer. "
261244
f"Try setting footer_sample_size >= {large_footer}."
262245
)
263-
for i, block in enumerate(
264-
fs.cat_ranges(
265-
paths,
266-
missing_footer_starts,
267-
footer_starts,
268-
)
269-
):
246+
path0 = [paths[i] for i, _ in large_footer]
247+
starts = [_[1] for _ in large_footer]
248+
ends = [file_sizes[i] - footer_sample_size for i, _ in large_footer]
249+
data = fs.cat_ranges(path0, starts, ends)
250+
for i, (path, start, block) in enumerate(zip(path0, starts, data)):
270251
footer_samples[i] = block + footer_samples[i]
271-
footer_starts[i] = missing_footer_starts[i]
252+
footer_starts[i] = start
253+
result = {
254+
path: {(start, size): data}
255+
for path, start, size, data in zip(
256+
paths, footer_starts, file_sizes, footer_samples
257+
)
258+
}
272259

273260
# Calculate required byte ranges for each path
274261
for i, path in enumerate(paths):
@@ -284,9 +271,6 @@ def _get_parquet_byte_ranges(
284271
data_paths += [path] * len(path_data_starts)
285272
data_starts += path_data_starts
286273
data_ends += path_data_ends
287-
result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
288-
footer_samples[i]
289-
)
290274

291275
# Merge adjacent offset ranges
292276
data_paths, data_starts, data_ends = merge_offset_ranges(
@@ -295,19 +279,14 @@ def _get_parquet_byte_ranges(
295279
data_ends,
296280
max_gap=max_gap,
297281
max_block=max_block,
298-
sort=False, # Should already be sorted
282+
sort=True,
299283
)
300284

301-
# Start by populating `result` with footer samples
302-
for i, path in enumerate(paths):
303-
result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}
285+
# Transfer the data byte-ranges into local memory
286+
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
304287

305-
# Transfer the data byte-ranges into local memory
306-
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
307-
308-
# Add b"PAR1" to header if necessary
309-
if add_header_magic:
310-
_add_header_magic(result)
288+
# Add b"PAR1" to headers
289+
_add_header_magic(result)
311290

312291
return result
313292

@@ -362,7 +341,7 @@ def _transfer_ranges(fs, blocks, paths, starts, ends):
362341

363342
def _add_header_magic(data):
364343
# Add b"PAR1" to file headers
365-
for path in list(data.keys()):
344+
for path in list(data):
366345
add_magic = True
367346
for k in data[path]:
368347
if k[0] == 0 and k[1] >= 4:
@@ -419,9 +398,6 @@ def __init__(self):
419398

420399
self.fp = fp
421400

422-
def _row_group_filename(self, row_group, pf):
423-
return pf.row_group_filename(row_group)
424-
425401
def _parquet_byte_ranges(
426402
self,
427403
columns,
@@ -476,7 +452,7 @@ def _parquet_byte_ranges(
476452
# specific row-groups
477453
if row_group_indices is None or r in row_group_indices:
478454
# Find the target parquet-file path for `row_group`
479-
fn = self._row_group_filename(row_group, pf)
455+
fn = pf.row_group_filename(row_group)
480456

481457
for column in row_group.columns:
482458
name = column.meta_data.path_in_schema
@@ -515,9 +491,6 @@ def __init__(self):
515491

516492
self.pq = pq
517493

518-
def _row_group_filename(self, row_group, metadata):
519-
raise NotImplementedError
520-
521494
def _parquet_byte_ranges(
522495
self,
523496
columns,
@@ -530,6 +503,7 @@ def _parquet_byte_ranges(
530503
if metadata is not None:
531504
raise ValueError("metadata input not supported for PyarrowEngine")
532505
if filters:
506+
# there must be a way!
533507
raise NotImplementedError
534508

535509
data_starts, data_ends = [], []
@@ -555,7 +529,7 @@ def _parquet_byte_ranges(
555529
column_set |= set(md_index)
556530
if column_set is not None:
557531
column_set = [
558-
_ if isinstance(_, list) else _.split(".") for _ in column_set
532+
_[:1] if isinstance(_, list) else _.split(".")[:1] for _ in column_set
559533
]
560534

561535
# Loop through column chunks to add required byte ranges
@@ -580,15 +554,15 @@ def _parquet_byte_ranges(
580554
]
581555
if _ is not None
582556
)
583-
if footer_start is None or file_offset0 < footer_start:
557+
if file_offset0 < footer_start:
584558
data_starts.append(file_offset0)
585559
data_ends.append(
586560
min(
587561
meta["total_compressed_size"] + file_offset0,
588-
footer_start
589-
or (meta["total_compressed_size"] + file_offset0),
562+
footer_start,
590563
)
591564
)
565+
592566
data_starts.append(footer_start)
593567
data_ends.append(footer_start + len(footer))
594568
return data_starts, data_ends

fsspec/tests/test_parquet.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -207,17 +207,31 @@ def test_multiple(tmpdir):
207207
assert expect.equals(result)
208208

209209

210-
@pytest.mark.parametrize("n", [100, 10_000, 1_000_000])
210+
@pytest.mark.parametrize("n", [1_000, 1_000_000])
211211
def test_nested(n, tmpdir, engine):
212212
path = os.path.join(str(tmpdir), "test.parquet")
213213
pa = pytest.importorskip("pyarrow")
214214
flat = pa.array([random.random() for _ in range(n)])
215-
a = random.random()
216-
b = random.random()
217-
nested = pa.array([{"a": a, "b": b} for _ in range(n)])
215+
nested = pa.array([{"a": random.random(), "b": random.random()} for _ in range(n)])
216+
data = [float(_[0]) for _ in nested]
218217
table = pa.table({"flat": flat, "nested": nested})
219218
pq.write_table(table, path)
220219
with open_parquet_file(path, columns=["nested.a"], engine=engine) as fh:
221220
col = pd.read_parquet(fh, engine=engine, columns=["nested.a"])
222221
name = "a" if engine == "pyarrow" else "nested.a"
223-
assert (col[name] == a).all()
222+
assert (col[name] == data).all()
223+
224+
225+
@PYARROW_MARK
226+
def test_nested_arrow_nodict(tmpdir):
227+
pa = pytest.importorskip("pyarrow")
228+
n = 1_000_000
229+
path = os.path.join(str(tmpdir), "test.parquet")
230+
flat = pa.array([random.random() for _ in range(n)])
231+
nested = pa.array([{"a": random.random(), "b": random.random()} for _ in range(n)])
232+
data = [float(_[0]) for _ in nested]
233+
table = pa.table({"flat": flat, "nested": nested})
234+
pq.write_table(table, path, use_dictionary=False)
235+
with open_parquet_file(path, columns=["nested"], engine="pyarrow") as fh:
236+
col = pd.read_parquet(fh, engine="pyarrow", columns=["nested.a"])
237+
assert (col["a"] == data).all()

fsspec/utils.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,16 @@ def merge_offset_ranges(
566566
)
567567
)
568568
)
569+
remove = []
570+
for i, (path, start, end) in enumerate(zip(paths, starts, ends)):
571+
if any(
572+
p == path and start >= s and end <= e and i != i2
573+
for i2, (p, s, e) in enumerate(zip(paths, starts, ends))
574+
):
575+
remove.append(i)
576+
paths = [p for i, p in enumerate(paths) if i not in remove]
577+
starts = [s for i, s in enumerate(starts) if i not in remove]
578+
ends = [e for i, e in enumerate(ends) if i not in remove]
569579

570580
if paths:
571581
# Loop through the coupled `paths`, `starts`, and
@@ -587,7 +597,7 @@ def merge_offset_ranges(
587597
new_starts.append(starts[i])
588598
new_ends.append(ends[i])
589599
else:
590-
# Merge with previous block by updating the
600+
# Merge with the previous block by updating the
591601
# last element of `ends`
592602
new_ends[-1] = ends[i]
593603
return new_paths, new_starts, new_ends

0 commit comments

Comments
 (0)