import gc
import os
import traceback
- from typing import List, Optional
+ from typing import List, Optional, Iterator

import ray
import ray.util
@@ -21,7 +21,7 @@

from llmserve.backend.llm.initializers import get_initializer_cls_by_name
from llmserve.backend.llm.pipelines import get_pipeline_cls_by_name
- from llmserve.backend.llm.pipelines._base import BasePipeline
+ from llmserve.backend.llm.pipelines._base import BasePipeline, StreamingPipeline
from llmserve.backend.llm.utils import (
    init_torch_dist_process_group_async,
    timeit,
@@ -171,7 +171,26 @@ def generate(
    )
    return outputs

- import logging
+ @timeit
+ def stream(
+     prompts: List[Prompt],
+     pipeline: BasePipeline,
+     **generate_kwargs,
+ ) -> Iterator[List[Response]]:
+     """Generate predictions using a Pipeline.
+
+     Args:
+         prompts (List[Prompt]): List of prompts.
+         pipeline (BasePipeline): Pipeline to use.
+         **generate_kwargs: Keyword arguments to pass to the pipeline's `generate` method.
+     """
+     if not isinstance(pipeline, StreamingPipeline):
+         raise RuntimeError(f"Pipeline {pipeline} does not support streaming.")
+     yield from pipeline.stream(
+         prompts,
+         **generate_kwargs,
+     )
+
@ray.remote
class PredictionWorker(TorchDistributedWorker):
    """A PredictionWorker is a Ray remote actor that runs a single shard of a DeepSpeed job.
@@ -277,21 +296,36 @@ def generate(
        )
        return responses_1 + responses_2

+     def stream(
+         self,
+         data: List[Prompt],
+         *,
+         timeout_s: Optional[float] = None,
+         start_timestamp: Optional[float] = None,
+         **kwargs,
+     ) -> Iterator[List[Response]]:
+         yield from stream(
+             data,
+             self.generator,
+             timeout_s=timeout_s,
+             start_timestamp=start_timestamp,
+             **kwargs,
+         )
+
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}:{self.llm_config.model_id}"

    def ping(self) -> bool:
        """Ping the worker."""
        return True

-     async def worker_stream_generate_texts(self, prompt: Union[Prompt, List[Prompt]], **kwargs) -> Generator[str, None, None]:  # type: ignore
-         logger.info(f"Call PredictionWorker.worker_stream_generate_texts with kwargs: {kwargs}")
-         for s in self.generator.streamGenerate(prompt, **kwargs):
-             # logger.info(f"PredictionWorker.worker_stream_generate_texts -> yield ->{s}")
-             yield s
+     def can_stream(self) -> bool:
+         """Whether the worker can stream."""
+         return isinstance(self.generator, StreamingPipeline)

class GenericEngine(LLMEngine):
    base_worker_group = None
+     can_stream = None

    async def launch_engine(
        self,
@@ -338,11 +372,11 @@ async def launch_engine(
                    num_gpus_per_worker=scaling_config.num_gpus_per_worker
                )
                for worker, local_rank in zip(worker_group, local_ranks)
-                 # for worker in worker_group
            ]
        )

        self.base_worker_group = worker_group
+         self.can_stream = await asyncio.gather(*[worker_group[0].can_stream.remote()])
        return worker_group

    async def predict(
@@ -429,14 +463,45 @@ async def check_health(self):
                f"At least one prediction worker is dead. Dead workers: {dead_actors}. "
                "Reinitializing worker group."
            )
-
-     def stream_generate_texts(self, prompt: Union[Prompt, List[Prompt]]) -> Generator[str, None, None]:  # type: ignore
-         logger.info(f"GenericEngine.stream_generate_texts -> worker.length: {len(self.base_worker_group)}")
-         worker0 = self.base_worker_group[0]
-         for strHandle in worker0.worker_stream_generate_texts.remote(
-             prompt,
-             **self.args.model_config.generation.all_generate_kwargs if self.args.model_config.generation else {}
-         ):
-             val = ray.get(strHandle)
-             logger.info(f"GenericEngine.stream_generate_texts -> yield -> {val}")
-             yield val
+
+     async def stream(
+         self,
+         prompts: List[Prompt],
+         *,
+         timeout_s: float = 60,
+         start_timestamp: Optional[float] = None,
+         lock: asyncio.Lock,
+     ) -> Iterator[List[Response]]:
+         """Generate text for a list of prompts.
+
+         Args:
+             prompts (List[Prompt]): Batch of prompts to generate text from.
+             timeout_s (float, optional): Timeout for the generation. Defaults
+                 to 60. Ignored if start_timestamp is None.
+             start_timestamp (Optional[float], optional): Timestamp of when the
+                 batch was created. Defaults to None. If set, will early stop
+                 the generation.
+
+         Returns:
+             A list of generated texts.
+         """
+         if self.can_stream:
+             async with lock:
+                 tasks = [
+                     worker.stream.options(num_returns="streaming").remote(
+                         prompts,
+                         timeout_s=timeout_s,
+                         start_timestamp=start_timestamp,
+                         **self.args.model_config.generation.all_generate_kwargs,
+                     )
+                     for worker in self.base_worker_group
+                 ]
+                 async for result in tasks[0]:
+                     yield await result
+         else:
+             logger.warning(
+                 f"Pipeline {self.args.model_config.initialization.pipeline} does not support streaming. Ignoring queue."
+             )
+             yield await self.predict(
+                 prompts, timeout_s=timeout_s, start_timestamp=start_timestamp
+             )
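
For orientation, below is a minimal sketch of how the new streaming entry point might be consumed by a caller. It assumes an already-launched `GenericEngine` (here called `engine`) and a batch of `Prompt` objects; the names `engine`, `prompt_batch`, and `consume_stream`, and the way results are printed, are illustrative assumptions rather than code from this commit.

```python
import asyncio

async def consume_stream(engine, prompt_batch):
    """Drain the async generator returned by GenericEngine.stream (sketch).

    `engine` is assumed to be a GenericEngine whose launch_engine() has already
    completed, and `prompt_batch` a List[Prompt]; each item yielded by stream()
    is a List[Response] coming from the rank-0 worker's streaming pipeline.
    """
    lock = asyncio.Lock()  # stream() now requires a caller-supplied asyncio.Lock
    async for responses in engine.stream(
        prompt_batch,
        timeout_s=60,
        start_timestamp=None,
        lock=lock,
    ):
        for response in responses:
            print(response)  # Response is assumed to carry the generated text chunk

# Hypothetical usage: asyncio.run(consume_stream(engine, prompt_batch))
```

Note the design choice in `GenericEngine.stream`: the remote `stream` call is issued to every worker in `base_worker_group` with `num_returns="streaming"`, but only the first worker's object-ref generator (`tasks[0]`) is iterated, so the engine surfaces a single stream even when the model is sharded across several workers.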