
Commit ce20b68

feat: add support for replicate.stream()

This PR adds support for streaming predictions via the `replicate.stream()` method.

Changes:
- Add a `stream()` method to both the Replicate and AsyncReplicate clients
- Add a module-level `stream()` function for convenience
- Create a new `lib/_predictions_stream.py` module with the streaming logic
- Add comprehensive tests for sync and async streaming
- Update the README with documentation and examples using anthropic/claude-4-sonnet

The stream method creates a prediction and returns an iterator that yields output chunks as they become available via Server-Sent Events (SSE). This is useful for language models where you want to display output as it's generated.

1 parent 2804bd6 commit ce20b68
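
For reference, a minimal sketch of the new surface on the async client, as added in this commit. The no-argument `AsyncReplicate()` construction assumes the API token is picked up from the environment; the model ref and prompt are illustrative:

```python
import asyncio

from replicate import AsyncReplicate

async def main() -> None:
    # Assumes the API token is configured in the environment.
    client = AsyncReplicate()
    # stream() yields output chunks as SSE events arrive.
    async for event in client.stream(
        "anthropic/claude-4-sonnet",
        input={"prompt": "Write a haiku about coding"},
    ):
        print(str(event), end="")

asyncio.run(main())
```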

File tree

6 files changed: +525 additions, -4 deletions


README.md

Lines changed: 11 additions & 3 deletions

@@ -118,14 +118,18 @@ For models that support streaming (particularly language models), you can use `r
 import replicate
 
 for event in replicate.stream(
-    "meta/meta-llama-3-70b-instruct",
+    "anthropic/claude-4-sonnet",
     input={
-        "prompt": "Please write a haiku about llamas.",
+        "prompt": "Give me a recipe for tasty smashed avocado on sourdough toast.",
+        "max_tokens": 8192,
+        "system_prompt": "You are a helpful assistant",
     },
 ):
     print(str(event), end="")
 ```
 
+The `stream()` method creates a prediction and returns an iterator that yields output chunks as they become available via Server-Sent Events (SSE). This is useful for language models where you want to display output as it's generated rather than waiting for the entire response.
+
 ## Async usage
 
 Simply import `AsyncReplicate` instead of `Replicate` and use `await` with each API call:
@@ -172,7 +176,11 @@ async def main():
 
     # Stream a model's output
     async for event in replicate.stream(
-        "meta/meta-llama-3-70b-instruct", input={"prompt": "Write a haiku about coding"}
+        "anthropic/claude-4-sonnet",
+        input={
+            "prompt": "Write a haiku about coding",
+            "system_prompt": "You are a helpful assistant",
+        },
     ):
         print(str(event), end="")
 
src/replicate/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -109,7 +109,7 @@
     if not __name.startswith("__"):
         try:
             # Skip symbols that are imported later from _module_client
-            if __name in ("run", "use"):
+            if __name in ("run", "use", "stream"):
                 continue
             __locals[__name].__module__ = "replicate"
         except (TypeError, AttributeError):
@@ -253,6 +253,7 @@ def _reset_client() -> None:  # type: ignore[reportUnusedFunction]
     use as use,
     files as files,
     models as models,
+    stream as stream,
     account as account,
     hardware as hardware,
     webhooks as webhooks,
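
With the `stream as stream` re-export in place, the function can also be imported directly from the package root, mirroring `run` and `use`. A minimal sketch (model ref and input are illustrative):

```python
# `stream` is importable from the package root after this change.
from replicate import stream

for event in stream(
    "anthropic/claude-4-sonnet",
    input={"prompt": "Write a haiku about coding"},
):
    print(str(event), end="")
```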

src/replicate/_client.py

Lines changed: 97 additions & 0 deletions

@@ -320,6 +320,54 @@ def use(
         # TODO: Fix mypy overload matching for streaming parameter
         return _use(self, ref, hint=hint, streaming=streaming)  # type: ignore[call-overload, no-any-return]
 
+    def stream(
+        self,
+        ref: Union[Model, Version, ModelVersionIdentifier, str],
+        *,
+        file_encoding_strategy: Optional["FileEncodingStrategy"] = None,
+        **params: Unpack[PredictionCreateParamsWithoutVersion],
+    ) -> Iterator[str]:
+        """
+        Stream output from a model prediction.
+
+        This creates a prediction and returns an iterator that yields output chunks
+        as they become available via Server-Sent Events (SSE).
+
+        Args:
+            ref: Reference to the model or version to run. Can be:
+                - A string containing a version ID (e.g. "5c7d5dc6dd8bf75c1acaa8565735e7986bc5b66206b55cca93cb72c9bf15ccaa")
+                - A string with owner/name format (e.g. "replicate/hello-world")
+                - A string with owner/name:version format (e.g. "replicate/hello-world:5c7d5dc6...")
+                - A Model instance with owner and name attributes
+                - A Version instance with id attribute
+                - A ModelVersionIdentifier dictionary with owner, name, and/or version keys
+            file_encoding_strategy: Strategy for encoding file inputs, options are "base64" or "url"
+            **params: Additional parameters to pass to the prediction creation endpoint, including
+                the required "input" dictionary with model-specific parameters
+
+        Yields:
+            str: Output chunks from the model as they become available
+
+        Raises:
+            ValueError: If the reference format is invalid or the model doesn't support streaming
+            ReplicateError: If the prediction fails
+
+        Example:
+            for event in replicate.stream(
+                "meta/meta-llama-3-70b-instruct",
+                input={"prompt": "Write a haiku about coding"},
+            ):
+                print(str(event), end="")
+        """
+        from .lib._predictions_stream import stream
+
+        return stream(
+            self,
+            ref,
+            file_encoding_strategy=file_encoding_strategy,
+            **params,
+        )
+
     def copy(
         self,
         *,
@@ -695,6 +743,55 @@ def use(
         # TODO: Fix mypy overload matching for streaming parameter
         return _use(self, ref, hint=hint, streaming=streaming)  # type: ignore[call-overload, no-any-return]
 
+    async def stream(
+        self,
+        ref: Union[Model, Version, ModelVersionIdentifier, str],
+        *,
+        file_encoding_strategy: Optional["FileEncodingStrategy"] = None,
+        **params: Unpack[PredictionCreateParamsWithoutVersion],
+    ) -> AsyncIterator[str]:
+        """
+        Stream output from a model prediction asynchronously.
+
+        This creates a prediction and returns an async iterator that yields output chunks
+        as they become available via Server-Sent Events (SSE).
+
+        Args:
+            ref: Reference to the model or version to run. Can be:
+                - A string containing a version ID (e.g. "5c7d5dc6dd8bf75c1acaa8565735e7986bc5b66206b55cca93cb72c9bf15ccaa")
+                - A string with owner/name format (e.g. "replicate/hello-world")
+                - A string with owner/name:version format (e.g. "replicate/hello-world:5c7d5dc6...")
+                - A Model instance with owner and name attributes
+                - A Version instance with id attribute
+                - A ModelVersionIdentifier dictionary with owner, name, and/or version keys
+            file_encoding_strategy: Strategy for encoding file inputs, options are "base64" or "url"
+            **params: Additional parameters to pass to the prediction creation endpoint, including
+                the required "input" dictionary with model-specific parameters
+
+        Yields:
+            str: Output chunks from the model as they become available
+
+        Raises:
+            ValueError: If the reference format is invalid or the model doesn't support streaming
+            ReplicateError: If the prediction fails
+
+        Example:
+            async for event in replicate.stream(
+                "meta/meta-llama-3-70b-instruct",
+                input={"prompt": "Write a haiku about coding"},
+            ):
+                print(str(event), end="")
+        """
+        from .lib._predictions_stream import async_stream
+
+        async for chunk in async_stream(
+            self,
+            ref,
+            file_encoding_strategy=file_encoding_strategy,
+            **params,
+        ):
+            yield chunk
+
     def copy(
         self,
         *,
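
A hedged sketch of calling the new method on an explicit client instance; the model ref is illustrative, and any of the ref formats listed in the docstring should be accepted:

```python
from replicate import Replicate

# Assumes the API token is configured in the environment.
client = Replicate()

# Equivalent ref forms (illustrative): "owner/name", "owner/name:version",
# a bare version ID, or Model / Version / ModelVersionIdentifier objects.
for chunk in client.stream(
    "replicate/hello-world",
    input={"prompt": "Write a haiku about coding"},
):
    print(chunk, end="")
```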

src/replicate/_module_client.py

Lines changed: 5 additions & 0 deletions

@@ -82,6 +82,7 @@ def __load__(self) -> PredictionsResource:
     __client: Replicate = cast(Replicate, {})
     run = __client.run
     use = __client.use
+    stream = __client.stream
 else:
 
     def _run(*args, **kwargs):
@@ -100,8 +101,12 @@ def _use(ref, *, hint=None, streaming=False, use_async=False, **kwargs):
 
         return use(Replicate, ref, hint=hint, streaming=streaming, **kwargs)
 
+    def _stream(*args, **kwargs):
+        return _load_client().stream(*args, **kwargs)
+
     run = _run
     use = _use
+    stream = _stream
 
 files: FilesResource = FilesResourceProxy().__as_proxied__()
 models: ModelsResource = ModelsResourceProxy().__as_proxied__()
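
Because `_stream` defers to `_load_client()`, no client is constructed at import time; the default client is created on the first call. A minimal sketch of the resulting module-level usage (model ref and input are illustrative):

```python
import replicate

# The default client is created lazily on this first call.
for event in replicate.stream(
    "replicate/hello-world",
    input={"prompt": "hello"},
):
    print(str(event), end="")
```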
src/replicate/lib/_predictions_stream.py

Lines changed: 188 additions & 0 deletions

@@ -0,0 +1,188 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Union, Iterator, Optional
+from collections.abc import AsyncIterator
+from typing_extensions import Unpack
+
+from replicate.lib._files import FileEncodingStrategy
+from replicate.types.prediction_create_params import PredictionCreateParamsWithoutVersion
+
+from ..types import PredictionCreateParams
+from ._models import Model, Version, ModelVersionIdentifier, resolve_reference
+
+if TYPE_CHECKING:
+    from .._client import Replicate, AsyncReplicate
+
+
+def stream(
+    client: "Replicate",
+    ref: Union[Model, Version, ModelVersionIdentifier, str],
+    *,
+    file_encoding_strategy: Optional["FileEncodingStrategy"] = None,
+    **params: Unpack[PredictionCreateParamsWithoutVersion],
+) -> Iterator[str]:
+    """
+    Stream output from a model prediction.
+
+    This creates a prediction and returns an iterator that yields output chunks
+    as they become available via Server-Sent Events (SSE).
+
+    Args:
+        client: The Replicate client instance
+        ref: Reference to the model or version to run. Can be:
+            - A string containing a version ID
+            - A string with owner/name format (e.g. "replicate/hello-world")
+            - A string with owner/name:version format
+            - A Model instance
+            - A Version instance
+            - A ModelVersionIdentifier dictionary
+        file_encoding_strategy: Strategy for encoding file inputs
+        **params: Additional parameters including the required "input" dictionary
+
+    Yields:
+        str: Output chunks from the model as they become available
+
+    Raises:
+        ValueError: If the reference format is invalid
+        ReplicateError: If the prediction fails or streaming is not available
+    """
+    # Resolve ref to its components
+    try:
+        version, owner, name, version_id = resolve_reference(ref)
+    except ValueError:
+        # If resolution fails, treat it as a version ID if it's a string
+        if isinstance(ref, str):
+            version_id = ref
+            owner = name = None
+        else:
+            raise
+
+    # Create prediction
+    prediction = None
+    if version_id is not None:
+        params_with_version: PredictionCreateParams = {**params, "version": version_id}
+        prediction = client.predictions.create(file_encoding_strategy=file_encoding_strategy, **params_with_version)
+    elif owner and name:
+        prediction = client.models.predictions.create(
+            file_encoding_strategy=file_encoding_strategy, model_owner=owner, model_name=name, **params
+        )
+    else:
+        if isinstance(ref, str):
+            params_with_version = {**params, "version": ref}
+            prediction = client.predictions.create(file_encoding_strategy=file_encoding_strategy, **params_with_version)
+        else:
+            raise ValueError(
+                f"Invalid reference format: {ref}. Expected a model name ('owner/name'), "
+                "a version ID, a Model object, a Version object, or a ModelVersionIdentifier."
+            )
+
+    # Check if streaming URL is available
+    if not prediction.urls or not prediction.urls.stream:
+        raise ValueError("Model does not support streaming. The prediction URLs do not include a stream endpoint.")
+
+    # Make SSE request to the stream URL
+    stream_url = prediction.urls.stream
+
+    with client._client.stream(
+        "GET",
+        stream_url,
+        headers={
+            "Accept": "text/event-stream",
+            "Cache-Control": "no-store",
+        },
+        timeout=None,  # No timeout for streaming
+    ) as response:
+        response.raise_for_status()
+
+        # Parse SSE events and yield output chunks
+        decoder = client._make_sse_decoder()
+        for sse in decoder.iter_bytes(response.iter_bytes()):
+            # The SSE data contains the output chunks
+            if sse.data:
+                yield sse.data
+
+
+async def async_stream(
+    client: "AsyncReplicate",
+    ref: Union[Model, Version, ModelVersionIdentifier, str],
+    *,
+    file_encoding_strategy: Optional["FileEncodingStrategy"] = None,
+    **params: Unpack[PredictionCreateParamsWithoutVersion],
+) -> AsyncIterator[str]:
+    """
+    Stream output from a model prediction asynchronously.
+
+    This creates a prediction and returns an async iterator that yields output chunks
+    as they become available via Server-Sent Events (SSE).
+
+    Args:
+        client: The AsyncReplicate client instance
+        ref: Reference to the model or version to run
+        file_encoding_strategy: Strategy for encoding file inputs
+        **params: Additional parameters including the required "input" dictionary
+
+    Yields:
+        str: Output chunks from the model as they become available
+
+    Raises:
+        ValueError: If the reference format is invalid
+        ReplicateError: If the prediction fails or streaming is not available
+    """
+    # Resolve ref to its components
+    try:
+        version, owner, name, version_id = resolve_reference(ref)
+    except ValueError:
+        # If resolution fails, treat it as a version ID if it's a string
+        if isinstance(ref, str):
+            version_id = ref
+            owner = name = None
+        else:
+            raise
+
+    # Create prediction
+    prediction = None
+    if version_id is not None:
+        params_with_version: PredictionCreateParams = {**params, "version": version_id}
+        prediction = await client.predictions.create(
+            file_encoding_strategy=file_encoding_strategy, **params_with_version
+        )
+    elif owner and name:
+        prediction = await client.models.predictions.create(
+            file_encoding_strategy=file_encoding_strategy, model_owner=owner, model_name=name, **params
+        )
+    else:
+        if isinstance(ref, str):
+            params_with_version = {**params, "version": ref}
+            prediction = await client.predictions.create(
+                file_encoding_strategy=file_encoding_strategy, **params_with_version
+            )
+        else:
+            raise ValueError(
+                f"Invalid reference format: {ref}. Expected a model name ('owner/name'), "
+                "a version ID, a Model object, a Version object, or a ModelVersionIdentifier."
+            )
+
+    # Check if streaming URL is available
+    if not prediction.urls or not prediction.urls.stream:
+        raise ValueError("Model does not support streaming. The prediction URLs do not include a stream endpoint.")
+
+    # Make SSE request to the stream URL
+    stream_url = prediction.urls.stream
+
+    async with client._client.stream(
+        "GET",
+        stream_url,
+        headers={
+            "Accept": "text/event-stream",
+            "Cache-Control": "no-store",
+        },
+        timeout=None,  # No timeout for streaming
+    ) as response:
+        response.raise_for_status()
+
+        # Parse SSE events and yield output chunks
+        decoder = client._make_sse_decoder()
+        async for sse in decoder.aiter_bytes(response.aiter_bytes()):
+            # The SSE data contains the output chunks
+            if sse.data:
+                yield sse.data
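
Since `stream()` raises `ValueError` when a prediction's URLs include no stream endpoint, callers may want to guard against models that don't support streaming. A hedged sketch (model ref is illustrative):

```python
import replicate

try:
    for event in replicate.stream(
        "replicate/hello-world",  # illustrative; not every model exposes a stream URL
        input={"prompt": "hello"},
    ):
        print(str(event), end="")
except ValueError as err:
    # Raised for an invalid ref format or when the prediction's
    # URLs do not include a stream endpoint.
    print(f"Streaming unavailable: {err}")
```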
