
Commit 7fbd78c

stream_util changes

Author: David Eigen (committed)
1 parent: 9f18f9e

File tree

3 files changed: +103 −70 lines

clarifai/runners/models/model_servicer.py
clarifai/runners/utils/stream_utils.py
clarifai/runners/utils/url_fetcher.py

clarifai/runners/models/model_servicer.py

Lines changed: 3 additions & 2 deletions
@@ -3,7 +3,8 @@
 from clarifai_grpc.grpc.api import service_pb2, service_pb2_grpc
 from clarifai_grpc.grpc.api.status import status_code_pb2, status_pb2
 
-from ..utils.url_fetcher import ensure_urls_downloaded, map_stream
+from ..utils.stream_utils import readahead
+from ..utils.url_fetcher import ensure_urls_downloaded
 
 
 class ModelServicer(service_pb2_grpc.V2Servicer):
@@ -68,7 +69,7 @@ def StreamModelOutputs(self,
 
     # Download any urls that are not already bytes.
     def _download_urls_stream(requests):
-      yield from map_stream(ensure_urls_downloaded, requests)
+      return readahead(map(ensure_urls_downloaded, requests))
 
     try:
       return self.model_class.stream(_download_urls_stream(request))
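A minimal sketch of the pattern this change introduces: the builtin map is lazy, so ensure_urls_downloaded runs as items are pulled, and readahead (added to stream_utils in this commit) pulls one item ahead on a background thread, overlapping URL downloads with model execution. ensure_downloaded below is a hypothetical stand-in:

import time

from clarifai.runners.utils.stream_utils import readahead

def ensure_downloaded(request):  # hypothetical stand-in for ensure_urls_downloaded
  time.sleep(0.1)  # simulate URL fetch latency
  return request

requests = iter(['req-1', 'req-2', 'req-3'])
for req in readahead(map(ensure_downloaded, requests)):
  print(req)  # while this item is handled, the next one is already downloading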

clarifai/runners/utils/stream_utils.py

Lines changed: 91 additions & 51 deletions
@@ -1,27 +1,31 @@
 import io
+import queue
 
-import requests
+import threading
+from concurrent.futures import ThreadPoolExecutor
 
 MB = 1024 * 1024
 
 
-class BufferStream(io.RawIOBase):
+class StreamingChunksReader(io.RawIOBase):
   '''
-  A buffer that reads data from a chunked stream and provides a file-like interface for reading.
+  A buffered reader that reads data from an iterator yielding chunks of bytes, used
+  to provide file-like access to a streaming data source.
 
-  :param chunk_iterator: An iterator that yields chunks of data (bytes)
-  '''
+  :param chunk_iterator: An iterator that yields chunks of data (bytes)
+  '''
 
   def __init__(self, chunk_iterator):
+    """
+    Args:
+        chunk_iterator (iterator): An iterator that yields chunks of bytes.
+    """
     self._chunk_iterator = chunk_iterator
     self.response = None
     self.buffer = b''
-    self.file_pos = 0
     self.b_pos = 0
     self._eof = False
 
-  #### read() methods
-
   def readable(self):
     return True
 
@@ -36,7 +40,7 @@ def readinto(self, output_buf):
       self.b_pos = 0
 
     # copy data to output buffer
-    n = min(len(output_buf), len(self.buffer - self.b_pos))
+    n = min(len(output_buf), len(self.buffer) - self.b_pos)
     assert n > 0
 
     output_buf[:n] = self.buffer[self.b_pos:self.b_pos + n]
@@ -52,16 +56,21 @@ def readinto(self, output_buf):
     return 0
 
 
-class SeekableBufferStream(io.RawIOBase):
-  '''
-  EXPERIMENTAL
-  A buffer that reads data from a chunked stream and provides a file-like interface for reading.
+class SeekableStreamingChunksReader(io.RawIOBase):
+  """
+  A buffered reader that reads data from an iterator yielding chunks of bytes, used
+  to provide file-like access to a streaming data source.
 
-  :param chunk_iterator: An iterator that yields chunks of data (bytes)
-  :param buffer_size: The maximum size of the buffer in bytes
-  '''
+  This class supports limited seeking to positions within the stream, by buffering
+  chunks internally and supporting basic seek operations within the buffer.
+  """
 
   def __init__(self, chunk_iterator, buffer_size=100 * MB):
+    """
+    Args:
+        chunk_iterator (iterator): An iterator that yields chunks of bytes.
+        buffer_size (int): Maximum buffer size in bytes before old chunks are discarded.
+    """
     self._chunk_iterator = chunk_iterator
     self.buffer_size = buffer_size
     self.buffer_vec = []
@@ -76,6 +85,15 @@ def readable(self):
     return True
 
   def readinto(self, output_buf):
+    """
+    Read data into the given buffer.
+
+    Args:
+        output_buf (bytearray): Buffer to read data into.
+
+    Returns:
+        int: Number of bytes read.
+    """
     if self._eof:
       return 0
 
@@ -107,7 +125,7 @@ def readinto(self, output_buf):
   def _load_next_chunk(self, check_bounds=True):
     self.buffer_vec.append(next(self._chunk_iterator))
     total = sum(len(chunk) for chunk in self.buffer_vec)
-    while total > self.buffer_size:
+    while total > self.buffer_size and len(self.buffer_vec) > 1:  # keep at least the last chunk
       chunk = self.buffer_vec.pop(0)
       total -= len(chunk)
       self.vec_pos -= 1
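The new "len(self.buffer_vec) > 1" guard matters when a single chunk is larger than buffer_size: without it, the eviction loop would pop the chunk that was just loaded and reads could never make progress. A toy illustration with illustrative sizes (not from the commit):

from clarifai.runners.utils.stream_utils import SeekableStreamingChunksReader

# a 10-byte chunk under a 5-byte buffer_size is still retained
reader = SeekableStreamingChunksReader(iter([b'0123456789']), buffer_size=5)
buf = bytearray(4)
n = reader.readinto(buf)
print(n, bytes(buf[:n]))  # expected: 4 b'0123'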
@@ -123,15 +141,27 @@ def tell(self):
     return self.file_pos
 
   def seek(self, offset, whence=io.SEEK_SET):
-    #printerr(f"seek(offset={offset}, whence={('SET', 'CUR', 'END')[whence]})")
-    # convert to offset from start of file stream
+    """
+    Seek to a new position in the buffered stream.
+
+    Args:
+        offset (int): The offset to seek to.
+        whence (int): The reference position (SEEK_SET, SEEK_CUR).
+            SEEK_END is not supported.
+
+    Returns:
+        int: The new file position.
+
+    Raises:
+        ValueError: If an invalid `whence` value is provided.
+        IOError: If seeking before the start of the buffer.
+    """
     if whence == io.SEEK_SET:
       seek_pos = offset
     elif whence == io.SEEK_CUR:
      seek_pos = self.file_pos + offset
     elif whence == io.SEEK_END:
-      self._seek_to_end()
-      seek_pos = self.file_pos + offset
+      raise ValueError('SEEK_END is not supported')
     else:
       raise ValueError(f"Invalid whence: {whence}")
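A short sketch of the resulting seek() contract on illustrative in-memory data: absolute and relative seeks are served from the buffered chunks, while SEEK_END now raises instead of draining the stream:

import io

from clarifai.runners.utils.stream_utils import SeekableStreamingChunksReader

reader = SeekableStreamingChunksReader(iter([b'hello ', b'world']))
reader.seek(4)                # SEEK_SET: absolute position in the stream
reader.seek(-2, io.SEEK_CUR)  # relative seek within the buffer
print(reader.tell())          # expected: 2
try:
  reader.seek(0, io.SEEK_END)
except ValueError as err:
  print(err)  # SEEK_END is not supported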

@@ -163,37 +193,47 @@ def seek(self, offset, whence=io.SEEK_SET):
 
     return self.file_pos
 
-  def _seek_to_end(self):
-    try:
-      # skip positions to end of the current buffer vec
-      if self.b_pos > 0:
-        self.file_pos += len(self.buffer_vec[self.vec_pos]) - self.b_pos
-        self.vec_pos += 1
-        self.b_pos = 0
-      # keep loading chunks until EOF
-      while True:
-        while self.vec_pos < len(self.buffer_vec):
-          self.file_pos += len(self.buffer_vec[self.vec_pos])
-          self.vec_pos += 1
-        self._load_next_chunk(check_bounds=False)
-    except StopIteration:
-      pass
-    # advance to end of buffer vec
-    while self.vec_pos < len(self.buffer_vec):
-      self.file_pos += len(self.buffer_vec[self.vec_pos])
-      self.vec_pos += 1
 
+def readahead(iterator, n=1, daemon=True):
+  """
+  Iterator wrapper that reads ahead from the underlying iterator, using a background thread.
+
+  Args:
+      iterator (iterator): The iterator to read from.
+      n (int): The maximum number of items to read ahead.
+      daemon (bool): Whether the background thread should be a daemon thread.
+  """
+  q = queue.Queue(maxsize=n)
+  _sentinel = object()
+
+  def _read():
+    for x in iterator:
+      q.put(x)
+    q.put(_sentinel)
 
-class URLStream(BufferStream):
+  t = threading.Thread(target=_read, daemon=daemon)
+  t.start()
+  while True:
+    x = q.get()
+    if x is _sentinel:
+      break
+    yield x
 
-  def __init__(self, url, chunk_size=1 * MB, buffer_size=10 * MB, requests_kwargs={}):
-    self.url = url
-    self.chunk_size = chunk_size
-    self.response = requests.get(self.url, stream=True, **requests_kwargs)
-    self.response.raise_for_status()
-    super().__init__(
-        self.response.iter_content(chunk_size=self.chunk_size), buffer_size=buffer_size)
 
-  def close(self):
-    super().close()
-    self.response.close()
+def map(f, iterator, parallel=1):
+  '''
+  Apply a function to each item in an iterator, optionally using multiple threads.
+  Similar to the built-in `map` function, but with support for parallel execution.
+  '''
+  if parallel < 1:
+    # the builtin `map` is shadowed by this function, so fall back to a plain
+    # generator expression for the sequential case
+    yield from (f(x) for x in iterator)
+    return
+  with ThreadPoolExecutor(max_workers=parallel) as executor:
+    futures = []
+    for i in range(parallel):
+      futures.append(executor.submit(f, next(iterator)))
+    for r in iterator:
+      res = futures.pop(0).result()
+      futures.append(executor.submit(f, r))  # start computing next result before yielding this one
+      yield res
+    for f in futures:
+      yield f.result()
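The two new helpers compose: map keeps up to parallel calls in flight while preserving input order, and readahead decouples the consumer from the producer through a bounded queue. A hedged end-to-end sketch with a stand-in for I/O-bound work:

import time

from clarifai.runners.utils import stream_utils

def slow_double(x):
  time.sleep(0.05)  # stand-in for I/O-bound work such as a URL fetch
  return 2 * x

items = iter(range(8))
# up to 4 calls run concurrently; results still arrive in input order
for y in stream_utils.readahead(stream_utils.map(slow_double, items, parallel=4)):
  print(y)  # 0, 2, 4, ..., 14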

clarifai/runners/utils/url_fetcher.py

Lines changed: 9 additions & 17 deletions
@@ -1,7 +1,9 @@
 import concurrent.futures
+from typing import Iterable
 
 import fsspec
 
+from clarifai.runners.utils import MB
 from clarifai.utils.logging import logger
 
 
@@ -50,20 +52,10 @@ def ensure_urls_downloaded(request, max_threads=128):
   return request
 
 
-def map_stream(f, it, parallel=1):
-  '''
-  Applies f to each element of it, yielding the results in order.
-  If parallel >= 1, uses a ThreadPoolExecutor to apply f in parallel to the current thread.
-  '''
-  if parallel < 1:
-    return map(f, it)
-  with ThreadPoolExecutor(max_workers=parallel) as executor:
-    futures = []
-    for i in range(parallel):
-      futures.append(executor.submit(f, next(it)))
-    for r in it:
-      res = futures.pop(0).result()
-      futures.append(executor.submit(f, r))  # start computing next result before yielding this one
-      yield res
-    for f in futures:
-      yield f.result()
+def stream_url(url: str, chunk_size: int = 1 * MB) -> Iterable[bytes]:
+  """
+  Opens a stream of byte chunks from a URL.
+  """
+  # block_size=0 means that the file is streamed
+  with fsspec.open(url, 'rb', block_size=0) as f:
+    yield from iter(lambda: f.read(chunk_size), b'')
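stream_url pairs with the readers in stream_utils to give file-like access to remote data without a full download; a sketch with an illustrative URL:

from clarifai.runners.utils.stream_utils import SeekableStreamingChunksReader
from clarifai.runners.utils.url_fetcher import stream_url

# stream the remote file in 1 MB chunks; read only the first 64 bytes
reader = SeekableStreamingChunksReader(stream_url('https://example.com/data.bin'))
header = bytearray(64)
n = reader.readinto(header)
print(n, bytes(header[:n]))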
