Commit 3c30d8c

Feat: Adaptive request splitting in EagerStoreReader (#26)
1 parent 5cb959a commit 3c30d8c

File tree

2 files changed: 148 additions & 75 deletions

src/obspec_utils/obspec.py

Lines changed: 63 additions & 50 deletions
@@ -361,10 +361,14 @@ class EagerStoreReader:
     subsequent reads from the in-memory cache. Useful for files that will be
     read multiple times or when seeking is frequent.
 
-    When `chunk_size` is provided, the file is fetched using parallel chunked
-    requests via `get_ranges()`, which can significantly reduce load time for
-    large files by maximizing parallelism. If the store supports the `Head`
-    protocol, the file size will be determined automatically via a HEAD request.
+    By default, the file is fetched using parallel range requests via
+    `get_ranges()`, which can significantly improve load time for large files.
+    The defaults (12 MB request size, max 18 concurrent requests) are tuned for
+    cloud storage. If the store supports the `Head` protocol, the file size
+    will be determined automatically via a HEAD request.
+
+    The parallel fetching strategy is based on Icechunk's approach:
+    https://github.com/earth-mover/icechunk/blob/main/icechunk/src/storage/mod.rs
 
     Works with any ReadableStore protocol implementation.
 
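For orientation, a minimal usage sketch of the new defaults; `my_store` is a hypothetical stand-in for any ReadableStore implementation, and `read()`/`seek()` are assumed to delegate to the reader's BytesIO-backed, file-like interface:

    from obspec_utils.obspec import EagerStoreReader

    # my_store: any object implementing the ReadableStore protocol (placeholder name)
    reader = EagerStoreReader(my_store, "data/large_file.bin")

    # The whole file is now in memory; reads and seeks cost no network round-trips
    header = reader.read(16)
    reader.seek(0)
    everything = reader.read()
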
@@ -377,8 +381,8 @@ class EagerStoreReader:
     - **Repeated random access**: After the initial load, any byte is accessible
       with no network latency.
     - **Small to medium files**: Files that fit comfortably in memory.
-    - **Parallel initial fetch**: With `chunk_size` set, the initial load uses
-      parallel requests for faster download.
+    - **Parallel initial fetch**: The default settings use parallel requests
+      for faster download on cloud storage.
 
     Consider alternatives when:
@@ -397,8 +401,9 @@ def __init__(
         self,
         store: ReadableStore,
         path: str,
-        chunk_size: int | None = None,
+        request_size: int = 12 * 1024 * 1024,
         file_size: int | None = None,
+        max_concurrent_requests: int = 18,
     ) -> None:
         """
         Create an eager reader that loads the entire file into memory.
@@ -411,54 +416,62 @@ def __init__(
             Any object implementing the [ReadableStore][obspec_utils.obspec.ReadableStore] protocol.
         path
             The path to the file within the store.
-        chunk_size
-            If provided, fetch the file using parallel requests of this size.
-            The file will be divided into chunks and fetched using `get_ranges()`.
-            If the store supports the `Head` protocol, the file size will be
-            determined automatically. Otherwise, `file_size` must be provided
-            for chunked fetching to work. If None (default), fetch with a single
-            `get()` request.
+        request_size
+            Target size for each parallel range request in bytes. Default is 12 MB,
+            tuned for cloud storage throughput. The file will be divided into
+            parts of this size and fetched using `get_ranges()`.
         file_size
-            File size in bytes. If not provided and `chunk_size` is set, the
-            reader will attempt to get the size via `store.head()` if the store
-            supports the `Head` protocol.
+            File size in bytes. If not provided, the reader will attempt to get
+            the size via `store.head()` if the store supports the `Head` protocol.
+            If the size cannot be determined, falls back to a single `get()` request.
+        max_concurrent_requests
+            Maximum number of parallel range requests. Default is 18. If the file
+            would require more requests than this, request sizes are increased to
+            fit within this limit.
         """
-        if chunk_size is None:
-            # Single request - fetch entire file
+        # Determine file size if not provided
+        if file_size is None:
+            if hasattr(store, "head") and callable(store.head):
+                file_size = store.head(path)["size"]
+            else:
+                # Fall back to single request if we can't determine size
+                result = store.get(path)
+                data = bytes(result.buffer())
+                self._buffer = io.BytesIO(data)
+                return
+
+        # Handle empty files
+        if file_size == 0:
+            self._buffer = io.BytesIO(b"")
+            return
+
+        # Calculate number of requests needed
+        num_requests = (file_size + request_size - 1) // request_size
+
+        # Cap at max_concurrent_requests by increasing request size
+        if num_requests > max_concurrent_requests:
+            num_requests = max_concurrent_requests
+            request_size = (file_size + num_requests - 1) // num_requests
+
+        # Skip concurrency overhead for single request
+        if num_requests == 1:
             result = store.get(path)
             data = bytes(result.buffer())
         else:
-            # Determine file size if not provided
-            if file_size is None:
-                if hasattr(store, "head") and callable(store.head):
-                    file_size = store.head(path)["size"]
-                else:
-                    # Fall back to single request if we can't determine size
-                    result = store.get(path)
-                    data = bytes(result.buffer())
-                    self._buffer = io.BytesIO(data)
-                    return
-
-            # Parallel chunked requests
-            if file_size == 0:
-                data = b""
-            else:
-                # Calculate chunk boundaries
-                num_chunks = (file_size + chunk_size - 1) // chunk_size
-
-                starts = []
-                lengths = []
-                for i in range(num_chunks):
-                    start = i * chunk_size
-                    length = min(chunk_size, file_size - start)
-                    starts.append(start)
-                    lengths.append(length)
-
-                # Fetch all chunks in parallel
-                results = store.get_ranges(path, starts=starts, lengths=lengths)
-
-                # Concatenate chunks into single buffer
-                data = b"".join(bytes(chunk) for chunk in results)
+            # Parallel range requests
+            starts = []
+            lengths = []
+            for i in range(num_requests):
+                start = i * request_size
+                length = min(request_size, file_size - start)
+                starts.append(start)
+                lengths.append(length)
+
+            # Fetch all parts in parallel
+            results = store.get_ranges(path, starts=starts, lengths=lengths)
+
+            # Concatenate into single buffer
+            data = b"".join(bytes(part) for part in results)
 
         self._buffer = io.BytesIO(data)
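To make the splitting arithmetic concrete, a standalone sketch of the request plan; `plan_requests` is an illustrative helper mirroring the logic above, not part of the library:

    def plan_requests(
        file_size: int,
        request_size: int = 12 * 1024 * 1024,
        max_concurrent_requests: int = 18,
    ) -> list[int]:
        """Per-request byte lengths the reader would issue (sketch)."""
        if file_size == 0:
            return []
        # Ceiling division: how many request_size parts cover the file
        num_requests = (file_size + request_size - 1) // request_size
        # Over the cap: grow the parts rather than exceed the concurrency limit
        if num_requests > max_concurrent_requests:
            num_requests = max_concurrent_requests
            request_size = (file_size + num_requests - 1) // num_requests
        return [
            min(request_size, file_size - i * request_size)
            for i in range(num_requests)
        ]

    # A 100 MiB file splits into ceil(100 / 12) = 9 parallel requests
    assert len(plan_requests(100 * 1024 * 1024)) == 9
    # A 1 GiB file would need 86 requests, so it is capped at 18 larger ones (~57 MiB each)
    assert len(plan_requests(1024 * 1024 * 1024)) == 18

Growing the request size instead of queueing extra requests keeps every byte covered by exactly one in-flight request, so the cap bounds both concurrency and bookkeeping.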

tests/test_registry.py

Lines changed: 85 additions & 25 deletions
@@ -476,8 +476,8 @@ async def __aiter__(self):
         yield self._data
 
 
-def test_eager_reader_with_chunk_size_and_file_size():
-    """Test EagerStoreReader uses get_ranges when chunk_size and file_size provided."""
+def test_eager_reader_with_request_size_and_file_size():
+    """Test EagerStoreReader uses get_ranges when request_size and file_size provided."""
     from obspec_utils.tracing import TracingReadableStore, RequestTrace
 
     # Create test data (16 bytes)
@@ -488,22 +488,22 @@ def test_eager_reader_with_chunk_size_and_file_size():
     trace = RequestTrace()
     traced_store = TracingReadableStore(mock_store, trace)
 
-    # Create reader with chunk_size and file_size
+    # Create reader with request_size and file_size
     reader = EagerStoreReader(
-        traced_store, "test.txt", chunk_size=4, file_size=len(data)
+        traced_store, "test.txt", request_size=4, file_size=len(data)
     )
 
     # Verify the data is correct
     assert reader.read() == data
 
     # Verify get_ranges was used (not get)
     summary = trace.summary()
-    assert summary["total_requests"] == 4  # 16 bytes / 4 byte chunks = 4 requests
+    assert summary["total_requests"] == 4  # 16 bytes / 4 byte requests = 4 requests
     assert all(r.method == "get_ranges" for r in trace.requests)
     assert summary["total_bytes"] == len(data)
 
 
-def test_eager_reader_with_chunk_size_uses_head():
+def test_eager_reader_uses_head():
     """Test EagerStoreReader uses head() to get file size when available."""
     from obspec_utils.tracing import TracingReadableStore, RequestTrace
 
@@ -515,16 +515,16 @@ def test_eager_reader_with_chunk_size_uses_head():
     trace = RequestTrace()
     traced_store = TracingReadableStore(mock_store, trace)
 
-    # Create reader with chunk_size but no file_size
+    # Create reader with request_size but no file_size
     # Store has head() method so it should be used
-    reader = EagerStoreReader(traced_store, "test.txt", chunk_size=4)
+    reader = EagerStoreReader(traced_store, "test.txt", request_size=4)
 
     # Verify the data is correct
     assert reader.read() == data
 
     # Verify get_ranges was used (head() call isn't traced, only data requests)
     summary = trace.summary()
-    assert summary["total_requests"] == 4  # 16 bytes / 4 byte chunks
+    assert summary["total_requests"] == 4  # 16 bytes / 4 byte requests
     assert all(r.method == "get_ranges" for r in trace.requests)
     assert summary["total_bytes"] == len(data)
 
@@ -541,9 +541,9 @@ def test_eager_reader_falls_back_to_single_get():
     trace = RequestTrace()
     traced_store = TracingReadableStore(mock_store, trace)
 
-    # Create reader with chunk_size but no file_size and no head()
+    # Create reader without file_size and no head()
     # Should fall back to single get() request
-    reader = EagerStoreReader(traced_store, "test.txt", chunk_size=4)
+    reader = EagerStoreReader(traced_store, "test.txt", request_size=4)
 
     # Verify the data is correct
     assert reader.read() == data
@@ -555,25 +555,25 @@ def test_eager_reader_falls_back_to_single_get():
     assert summary["total_bytes"] == len(data)
 
 
-def test_eager_reader_no_chunk_size():
-    """Test EagerStoreReader uses single get() when no chunk_size specified."""
+def test_eager_reader_small_file_uses_single_get():
+    """Test EagerStoreReader uses single get() when file fits in one request."""
     from obspec_utils.tracing import TracingReadableStore, RequestTrace
 
-    # Create test data
+    # Create test data smaller than default request_size (12 MB)
     data = b"0123456789ABCDEF"
     mock_store = MockReadableStoreWithHead(data)
 
     # Wrap with tracing
     trace = RequestTrace()
     traced_store = TracingReadableStore(mock_store, trace)
 
-    # Create reader without chunk_size
+    # Create reader with default settings - file is smaller than request_size
     reader = EagerStoreReader(traced_store, "test.txt")
 
     # Verify the data is correct
     assert reader.read() == data
 
-    # Verify single get() was used
+    # Verify single get() was used (skips concurrency overhead)
     summary = trace.summary()
     assert summary["total_requests"] == 1
     assert trace.requests[0].method == "get"
@@ -591,8 +591,8 @@ def test_eager_reader_empty_file():
     trace = RequestTrace()
     traced_store = TracingReadableStore(mock_store, trace)
 
-    # Create reader with chunk_size and file_size=0
-    reader = EagerStoreReader(traced_store, "test.txt", chunk_size=4, file_size=0)
+    # Create reader with file_size=0
+    reader = EagerStoreReader(traced_store, "test.txt", request_size=4, file_size=0)
 
     # Verify the data is empty
     assert reader.read() == b""
@@ -601,36 +601,96 @@ def test_eager_reader_empty_file():
     assert trace.total_requests == 0
 
 
-def test_eager_reader_chunk_boundaries():
-    """Test EagerStoreReader handles non-aligned chunk boundaries."""
+def test_eager_reader_request_boundaries():
+    """Test EagerStoreReader handles non-aligned request boundaries."""
     from obspec_utils.tracing import TracingReadableStore, RequestTrace
 
-    # Create test data (10 bytes, not evenly divisible by chunk_size=4)
+    # Create test data (10 bytes, not evenly divisible by request_size=4)
     data = b"0123456789"
     mock_store = MockReadableStoreWithHead(data)
 
     # Wrap with tracing
     trace = RequestTrace()
     traced_store = TracingReadableStore(mock_store, trace)
 
-    # Create reader with chunk_size=4, file_size=10
+    # Create reader with request_size=4, file_size=10
     reader = EagerStoreReader(
-        traced_store, "test.txt", chunk_size=4, file_size=len(data)
+        traced_store, "test.txt", request_size=4, file_size=len(data)
     )
 
     # Verify the data is correct
     assert reader.read() == data
 
-    # Should be 3 chunks: 0-3 (4 bytes), 4-7 (4 bytes), 8-9 (2 bytes)
+    # Should be 3 requests: 0-3 (4 bytes), 4-7 (4 bytes), 8-9 (2 bytes)
     summary = trace.summary()
     assert summary["total_requests"] == 3
     assert summary["total_bytes"] == len(data)
 
-    # Verify chunk sizes
+    # Verify request sizes
     lengths = [r.length for r in trace.requests]
     assert lengths == [4, 4, 2]
 
 
+def test_eager_reader_max_concurrent_requests():
+    """Test EagerStoreReader caps requests at max_concurrent_requests."""
+    from obspec_utils.tracing import TracingReadableStore, RequestTrace
+
+    # Create test data (100 bytes)
+    data = b"x" * 100
+    mock_store = MockReadableStoreWithHead(data)
+
+    # Wrap with tracing
+    trace = RequestTrace()
+    traced_store = TracingReadableStore(mock_store, trace)
+
+    # With request_size=10, would need 10 requests
+    # But max_concurrent_requests=4, so should redistribute to 4 requests
+    reader = EagerStoreReader(
+        traced_store,
+        "test.txt",
+        request_size=10,
+        file_size=len(data),
+        max_concurrent_requests=4,
+    )
+
+    # Verify the data is correct
+    assert reader.read() == data
+
+    # Should be capped at 4 requests
+    summary = trace.summary()
+    assert summary["total_requests"] == 4
+    assert summary["total_bytes"] == len(data)
+
+
+def test_eager_reader_redistribution_even_split():
+    """Test EagerStoreReader redistributes evenly when capping requests."""
+    from obspec_utils.tracing import TracingReadableStore, RequestTrace
+
+    # Create test data (100 bytes)
+    data = b"x" * 100
+    mock_store = MockReadableStoreWithHead(data)
+
+    # Wrap with tracing
+    trace = RequestTrace()
+    traced_store = TracingReadableStore(mock_store, trace)
+
+    # With request_size=10, would need 10 requests
+    # With max_concurrent_requests=4, should get 4 requests of 25 bytes each
+    reader = EagerStoreReader(
+        traced_store,
+        "test.txt",
+        request_size=10,
+        file_size=len(data),
+        max_concurrent_requests=4,
+    )
+
+    assert reader.read() == data
+
+    # Verify redistributed request sizes (25, 25, 25, 25)
+    lengths = [r.length for r in trace.requests]
+    assert lengths == [25, 25, 25, 25]
+
+
 @pytest.mark.parametrize("ReaderClass", ALL_READERS)
 def test_reader_context_manager(ReaderClass):
     """Test that readers work as context managers and release resources."""
