
Commit d9e9623

Modify v1 completions_stream logic to raise most exceptions before async streaming inference response (#534)
* consolidate streaming response logic into separate inline function. call execute() synchronously and call inline function async
* iterate
* refactor: pull inference result status/empty check outside of framework conditionals to dedupe code. put logic for unsuccessful/empty results before other handling logic for readability. add some commenting and other small edits.
* formatting fixes
* improve commenting
* fix and reenable 404 unit test
* fix stream success unit test, add async test client fixture
* move _response_chunk_generator() from an inline def in execute() to a separate private method for the usecase
* fix issue with streaming tests interacting by defining a per-session event loop fixture and reconfiguring test_create_streaming_task_success as an async test
* one more unit test
* update llm-engine Completions docs with details on streaming error handling
1 parent e46cbd4 commit d9e9623

File tree

7 files changed, +280 -113 lines changed


docs/guides/completions.md

Lines changed: 7 additions & 2 deletions
@@ -67,7 +67,11 @@ applications. When streaming, tokens will be sent as data-only

 To enable token streaming, pass `stream=True` to either [Completion.create](../../api/python_client/#llmengine.completion.Completion.create) or [Completion.acreate](../../api/python_client/#llmengine.completion.Completion.acreate).

-Note that errors from streaming calls are returned back to the user as plain-text messages and currently need to be handled by the client.
+### Streaming Error Handling
+
+Note: Error handling semantics are mixed for streaming calls:
+- Errors that arise *before* streaming begins are returned back to the user as `HTTP` errors with the appropriate status code.
+- Errors that arise *after* streaming begins within a `HTTP 200` response are returned back to the user as plain-text messages and currently need to be handled by the client.

 An example of token streaming using the synchronous Completions API looks as follows:

@@ -78,6 +82,7 @@ import sys

 from llmengine import Completion

+# errors occurring before streaming begins will be thrown here
 stream = Completion.create(
     model="llama-2-7b",
     prompt="Give me a 200 word summary on the current economic events in the US.",
@@ -90,7 +95,7 @@ for response in stream:
     if response.output:
         print(response.output.text, end="")
         sys.stdout.flush()
-    else: # an error occurred
+    else: # an error occurred after streaming began
         print(response.error) # print the error message out
         break
 ```
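As a quick illustration of the two error classes described in the updated docs, here is a client-side sketch based on the documentation example above. The catch-all `except Exception` is illustrative only; the exact exception type raised for a pre-stream HTTP error depends on the `llmengine` client version.

```python
from llmengine import Completion

try:
    # Per the updated docs, errors that occur before streaming begins
    # (e.g. an unknown model endpoint) surface as HTTP errors here.
    stream = Completion.create(
        model="llama-2-7b",
        prompt="Give me a 200 word summary on the current economic events in the US.",
        stream=True,
    )
    for response in stream:
        if response.output:
            print(response.output.text, end="")
        else:
            # Errors after streaming begins arrive inside the HTTP 200 stream
            # as plain-text messages and must be handled by the client.
            print(response.error)
            break
except Exception as exc:  # illustrative catch-all for pre-stream HTTP errors
    print(f"Request failed before streaming began: {exc}")
```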

model-engine/model_engine_server/api/llms_v1.py

Lines changed: 35 additions & 8 deletions
@@ -405,7 +405,29 @@ async def create_completion_stream_task(
         llm_model_endpoint_service=external_interfaces.llm_model_endpoint_service,
         tokenizer_repository=external_interfaces.tokenizer_repository,
     )
-    response = use_case.execute(user=auth, model_endpoint_name=model_endpoint_name, request=request)
+
+    try:
+        # Call execute() with await, since it needs to handle exceptions before we begin streaming the response below.
+        # execute() will create a response chunk generator and return a reference to it.
+        response = await use_case.execute(
+            user=auth, model_endpoint_name=model_endpoint_name, request=request
+        )
+    except (ObjectNotFoundException, ObjectNotAuthorizedException) as exc:
+        raise HTTPException(
+            status_code=404,
+            detail=str(exc),
+        ) from exc
+    except EndpointUnsupportedInferenceTypeException as exc:
+        raise HTTPException(
+            status_code=400,
+            detail=str(exc),
+        ) from exc
+    except ObjectHasInvalidValueException as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    except Exception as exc:
+        raise HTTPException(
+            status_code=500, detail="Internal error occurred. Our team has been notified."
+        ) from exc

     async def event_generator():
         try:
@@ -427,14 +449,19 @@ async def event_generator():
                 ),
                 metric_metadata,
             )
-        except (InvalidRequestException, ObjectHasInvalidValueException) as exc:
+        # The following two exceptions are only raised after streaming begins, so we wrap the exception within a Response object
+        except InvalidRequestException as exc:
             yield handle_streaming_exception(exc, 400, str(exc))
-        except (
-            ObjectNotFoundException,
-            ObjectNotAuthorizedException,
-            EndpointUnsupportedInferenceTypeException,
-        ) as exc:
-            yield handle_streaming_exception(exc, 404, str(exc))
+        except UpstreamServiceError as exc:
+            request_id = LoggerTagManager.get(LoggerTagKey.REQUEST_ID)
+            logger.exception(
+                f"Upstream service error for request {request_id}. Error detail: {str(exc.content)}"
+            )
+            yield handle_streaming_exception(
+                exc,
+                500,
+                f"Upstream service error for request_id {request_id}",
+            )
         except Exception as exc:
             yield handle_streaming_exception(
                 exc, 500, "Internal error occurred. Our team has been notified."
model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 71 additions & 46 deletions
@@ -78,7 +78,10 @@
     ObjectNotFoundException,
     UpstreamServiceError,
 )
-from model_engine_server.domain.gateways import DockerImageBatchJobGateway
+from model_engine_server.domain.gateways import (
+    DockerImageBatchJobGateway,
+    StreamingModelEndpointInferenceGateway,
+)
 from model_engine_server.domain.gateways.llm_artifact_gateway import LLMArtifactGateway
 from model_engine_server.domain.repositories import (
     DockerImageBatchJobBundleRepository,
@@ -1845,18 +1848,27 @@ async def execute(
     ) -> AsyncIterable[CompletionStreamV1Response]:
         """
         Runs the use case to create a stream inference task.
+        NOTE: Must be called with await(), since the function is not a generator itself, but rather creates one and
+        returns a reference to it. This structure allows exceptions that occur before response streaming begins
+        to propagate to the client as HTTP exceptions with the appropriate code.

         Args:
             user: The user who is creating the stream inference task.
             model_endpoint_name: The name of the model endpoint for the task.
             request: The body of the request to forward to the endpoint.

         Returns:
-            A response object that contains the status and result of the task.
+            An asynchronous response chunk generator, containing response objects to be iterated through with 'async for'.
+            Each response object contains the status and result of the task.

         Raises:
             ObjectNotFoundException: If a model endpoint with the given name could not be found.
+            ObjectHasInvalidValueException: If there are multiple model endpoints with the given name.
             ObjectNotAuthorizedException: If the owner does not own the model endpoint.
+            EndpointUnsupportedInferenceTypeException: If the model endpoint does not support streaming or uses
+                an unsupported inference framework.
+            UpstreamServiceError: If an error occurs upstream in the streaming inference API call.
+            InvalidRequestException: If request validation fails during inference.
         """

         request_id = LoggerTagManager.get(LoggerTagKey.REQUEST_ID)
@@ -2020,7 +2032,6 @@ async def execute(
                 model_content.model_name,
                 self.tokenizer_repository,
             )
-
         else:
             raise EndpointUnsupportedInferenceTypeException(
                 f"Unsupported inference framework {model_content.inference_framework}"
@@ -2031,15 +2042,55 @@ async def execute(
             num_retries=NUM_DOWNSTREAM_REQUEST_RETRIES,
             timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
         )
+
+        return self._response_chunk_generator(
+            request=request,
+            request_id=request_id,
+            model_endpoint=model_endpoint,
+            model_content=model_content,
+            inference_gateway=inference_gateway,
+            inference_request=inference_request,
+            num_prompt_tokens=num_prompt_tokens,
+        )
+
+    async def _response_chunk_generator(
+        self,
+        request: CompletionStreamV1Request,
+        request_id: Optional[str],
+        model_endpoint: ModelEndpoint,
+        model_content: GetLLMModelEndpointV1Response,
+        inference_gateway: StreamingModelEndpointInferenceGateway,
+        inference_request: SyncEndpointPredictV1Request,
+        num_prompt_tokens: Optional[int],
+    ) -> AsyncIterable[CompletionStreamV1Response]:
+        """
+        Async generator yielding tokens to stream for the completions response. Should only be called when
+        returned directly by execute().
+        """
         predict_result = inference_gateway.streaming_predict(
             topic=model_endpoint.record.destination, predict_request=inference_request
         )

         num_completion_tokens = 0
         async for res in predict_result:
-            result = res.result
-            if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED:
-                if res.status == TaskStatus.SUCCESS and result is not None:
+            if not res.status == TaskStatus.SUCCESS or res.result is None:
+                # Raise an UpstreamServiceError if the task has failed
+                if res.status == TaskStatus.FAILURE:
+                    raise UpstreamServiceError(
+                        status_code=500,
+                        content=(
+                            res.traceback.encode("utf-8") if res.traceback is not None else b""
+                        ),
+                    )
+                # Otherwise, yield empty response chunk for unsuccessful or empty results
+                yield CompletionStreamV1Response(
+                    request_id=request_id,
+                    output=None,
+                )
+            else:
+                result = res.result
+                # DEEPSPEED
+                if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED:
                     if "token" in result["result"]:
                         yield CompletionStreamV1Response(
                             request_id=request_id,
@@ -2063,15 +2114,11 @@ async def execute(
                                 num_completion_tokens=completion_token_count,
                             ),
                         )
-                else:
-                    yield CompletionStreamV1Response(
-                        request_id=request_id,
-                        output=None,
-                    )
-            elif (
-                model_content.inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE
-            ):
-                if res.status == TaskStatus.SUCCESS and result is not None:
+                # TEXT_GENERATION_INFERENCE
+                elif (
+                    model_content.inference_framework
+                    == LLMInferenceFramework.TEXT_GENERATION_INFERENCE
+                ):
                     if result["result"].get("generated_text") is not None:
                         finished = True
                     else:
@@ -2108,14 +2155,8 @@ async def execute(
                         raise UpstreamServiceError(
                             status_code=500, content=result.get("error")
                         )  # also change llms_v1.py that will return a 500 HTTPException so user can retry
-
-                else:
-                    yield CompletionStreamV1Response(
-                        request_id=request_id,
-                        output=None,
-                    )
-            elif model_content.inference_framework == LLMInferenceFramework.VLLM:
-                if res.status == TaskStatus.SUCCESS and result is not None:
+                # VLLM
+                elif model_content.inference_framework == LLMInferenceFramework.VLLM:
                     token = None
                     if request.return_token_log_probs:
                         token = TokenOutput(
@@ -2134,13 +2175,8 @@ async def execute(
                             token=token,
                         ),
                     )
-                else:
-                    yield CompletionStreamV1Response(
-                        request_id=request_id,
-                        output=None,
-                    )
-            elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
-                if res.status == TaskStatus.SUCCESS and result is not None:
+                # LIGHTLLM
+                elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
                     token = None
                     num_completion_tokens += 1
                     if request.return_token_log_probs:
@@ -2159,13 +2195,8 @@ async def execute(
                             token=token,
                         ),
                     )
-                else:
-                    yield CompletionStreamV1Response(
-                        request_id=request_id,
-                        output=None,
-                    )
-            elif model_content.inference_framework == LLMInferenceFramework.TENSORRT_LLM:
-                if res.status == TaskStatus.SUCCESS and result is not None:
+                # TENSORRT_LLM
+                elif model_content.inference_framework == LLMInferenceFramework.TENSORRT_LLM:
                     num_completion_tokens += 1
                     yield CompletionStreamV1Response(
                         request_id=request_id,
@@ -2176,15 +2207,9 @@ async def execute(
                             num_completion_tokens=num_completion_tokens,
                         ),
                     )
-                else:
-                    yield CompletionStreamV1Response(
-                        request_id=request_id,
-                        output=None,
-                    )
-            else:
-                raise EndpointUnsupportedInferenceTypeException(
-                    f"Unsupported inference framework {model_content.inference_framework}"
-                )
+                # No else clause needed for an unsupported inference framework, since we check
+                # model_content.inference_framework in execute() prior to calling _response_chunk_generator,
+                # raising an exception if it is not one of the frameworks handled above.


 class ModelDownloadV1UseCase:
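The docstring NOTE above hinges on a subtlety of Python async generators: an `async def` that contains `yield` defers its entire body, including validation, until iteration begins, whereas an `async def` that merely returns a generator runs its body at `await` time. The following self-contained sketch (hypothetical names, not model-engine code) shows why splitting `execute()` from `_response_chunk_generator()` lets validation errors surface before streaming starts.

```python
import asyncio
from typing import AsyncIterator


async def execute(valid: bool) -> AsyncIterator[int]:
    if not valid:
        # Raised eagerly, while the caller is still awaiting execute(),
        # i.e. before any streaming response has been started.
        raise ValueError("validation failed before streaming")
    return _chunks()


async def _chunks() -> AsyncIterator[int]:
    # This body only runs once the caller iterates, after streaming began.
    for i in range(3):
        yield i


async def main() -> None:
    try:
        stream = await execute(valid=False)
    except ValueError as exc:
        print(f"caught before streaming: {exc}")  # can become an HTTP error
        return
    async for chunk in stream:
        print(chunk)


asyncio.run(main())
```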

model-engine/tests/unit/api/conftest.py

Lines changed: 80 additions & 0 deletions
@@ -1,10 +1,13 @@
+import asyncio
 import datetime
 from typing import Any, Dict, Iterator, Tuple

 import pytest
+import pytest_asyncio
 from fastapi import Depends, HTTPException
 from fastapi.security import HTTPBasicCredentials
 from fastapi.testclient import TestClient
+from httpx import AsyncClient
 from model_engine_server.api.app import app
 from model_engine_server.api.dependencies import (
     AUTH,
@@ -90,6 +93,14 @@ def fake_auth():
     app.dependency_overrides[verify_authentication] = {}


+@pytest_asyncio.fixture(scope="session", autouse=True)
+def event_loop(request):
+    """Create an instance of the default event loop for each test case."""
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    yield loop
+    loop.close()
+
+
 @pytest.fixture
 def get_test_client_wrapper(get_repositories_generator_wrapper):
     def get_test_client(
@@ -159,6 +170,75 @@ def get_test_client(
     return get_test_client


+@pytest.fixture
+def get_async_test_client_wrapper(get_repositories_generator_wrapper):
+    def get_async_test_client(
+        fake_docker_repository_image_always_exists=True,
+        fake_model_bundle_repository_contents=None,
+        fake_model_endpoint_record_repository_contents=None,
+        fake_model_endpoint_infra_gateway_contents=None,
+        fake_batch_job_record_repository_contents=None,
+        fake_batch_job_progress_gateway_contents=None,
+        fake_docker_image_batch_job_bundle_repository_contents=None,
+        fake_docker_image_batch_job_gateway_contents=None,
+        fake_llm_fine_tuning_service_contents=None,
+        fake_file_storage_gateway_contents=None,
+        fake_file_system_gateway_contents=None,
+        fake_trigger_repository_contents=None,
+        fake_cron_job_gateway_contents=None,
+        fake_sync_inference_content=None,
+    ) -> AsyncClient:
+        if fake_docker_image_batch_job_gateway_contents is None:
+            fake_docker_image_batch_job_gateway_contents = {}
+        if fake_docker_image_batch_job_bundle_repository_contents is None:
+            fake_docker_image_batch_job_bundle_repository_contents = {}
+        if fake_batch_job_progress_gateway_contents is None:
+            fake_batch_job_progress_gateway_contents = {}
+        if fake_batch_job_record_repository_contents is None:
+            fake_batch_job_record_repository_contents = {}
+        if fake_model_endpoint_infra_gateway_contents is None:
+            fake_model_endpoint_infra_gateway_contents = {}
+        if fake_model_endpoint_record_repository_contents is None:
+            fake_model_endpoint_record_repository_contents = {}
+        if fake_model_bundle_repository_contents is None:
+            fake_model_bundle_repository_contents = {}
+        if fake_llm_fine_tuning_service_contents is None:
+            fake_llm_fine_tuning_service_contents = {}
+        if fake_file_storage_gateway_contents is None:
+            fake_file_storage_gateway_contents = {}
+        if fake_file_system_gateway_contents is None:
+            fake_file_system_gateway_contents = {}
+        if fake_trigger_repository_contents is None:
+            fake_trigger_repository_contents = {}
+        if fake_cron_job_gateway_contents is None:
+            fake_cron_job_gateway_contents = {}
+        if fake_sync_inference_content is None:
+            fake_sync_inference_content = {}
+        app.dependency_overrides[get_external_interfaces] = get_repositories_generator_wrapper(
+            fake_docker_repository_image_always_exists=fake_docker_repository_image_always_exists,
+            fake_model_bundle_repository_contents=fake_model_bundle_repository_contents,
+            fake_model_endpoint_record_repository_contents=fake_model_endpoint_record_repository_contents,
+            fake_model_endpoint_infra_gateway_contents=fake_model_endpoint_infra_gateway_contents,
+            fake_batch_job_record_repository_contents=fake_batch_job_record_repository_contents,
+            fake_batch_job_progress_gateway_contents=fake_batch_job_progress_gateway_contents,
+            fake_docker_image_batch_job_bundle_repository_contents=fake_docker_image_batch_job_bundle_repository_contents,
+            fake_docker_image_batch_job_gateway_contents=fake_docker_image_batch_job_gateway_contents,
+            fake_llm_fine_tuning_service_contents=fake_llm_fine_tuning_service_contents,
+            fake_file_storage_gateway_contents=fake_file_storage_gateway_contents,
+            fake_file_system_gateway_contents=fake_file_system_gateway_contents,
+            fake_trigger_repository_contents=fake_trigger_repository_contents,
+            fake_cron_job_gateway_contents=fake_cron_job_gateway_contents,
+            fake_sync_inference_content=fake_sync_inference_content,
+        )
+        app.dependency_overrides[get_external_interfaces_read_only] = app.dependency_overrides[
+            get_external_interfaces
+        ]
+        client = AsyncClient(app=app, base_url="http://test")
+        return client
+
+    return get_async_test_client
+
+
 @pytest.fixture
 def simple_client(get_test_client_wrapper) -> TestClient:
     """Returns a Client with no initial contents and a Docker repository that always returns True"""
