
Commit 2f5dd72

Return TGI errors (#313)
* Return TGI errors
* remove prints
* fix lint
1 parent 86003eb commit 2f5dd72

3 files changed (+199, −13 lines)


model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 21 additions & 10 deletions
@@ -909,7 +909,10 @@ def validate_and_update_completion_params(
         if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE:
             request.top_k = None if request.top_k == -1 else request.top_k
             request.top_p = None if request.top_p == 1.0 else request.top_p
-        if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]:
+        if inference_framework in [
+            LLMInferenceFramework.VLLM,
+            LLMInferenceFramework.LIGHTLLM,
+        ]:
             request.top_k = -1 if request.top_k is None else request.top_k
             request.top_p = 1.0 if request.top_p is None else request.top_p
         else:
@@ -919,7 +922,10 @@ def validate_and_update_completion_params(
             )
 
     # presence_penalty, frequency_penalty
-    if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]:
+    if inference_framework in [
+        LLMInferenceFramework.VLLM,
+        LLMInferenceFramework.LIGHTLLM,
+    ]:
         request.presence_penalty = (
             0.0 if request.presence_penalty is None else request.presence_penalty
         )
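Note: the two hunks above only reformat the framework checks; the behavior they preserve is that TGI treats the -1 / 1.0 sentinels as "unset" while vLLM and LightLLM expect explicit defaults. A minimal, self-contained sketch of that normalization (hypothetical function name, plain strings standing in for the LLMInferenceFramework enum):

    # Illustrative only -- the real code mutates the request object inside
    # validate_and_update_completion_params and compares against the enum.
    def normalize_sampling_params(framework: str, top_k, top_p):
        if framework == "text_generation_inference":
            # TGI expects "unset" to be None rather than the -1 / 1.0 sentinels.
            top_k = None if top_k == -1 else top_k
            top_p = None if top_p == 1.0 else top_p
        elif framework in ("vllm", "lightllm"):
            # vLLM and LightLLM expect explicit defaults instead of None.
            top_k = -1 if top_k is None else top_k
            top_p = 1.0 if top_p is None else top_p
        return top_k, top_p

    assert normalize_sampling_params("text_generation_inference", -1, 1.0) == (None, None)
    assert normalize_sampling_params("vllm", None, None) == (-1, 1.0)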
@@ -987,14 +993,17 @@ def model_output_to_completion_output(
                 raise InvalidRequestException(model_output.get("error"))  # trigger a 400
             else:
                 raise UpstreamServiceError(
-                    status_code=500, content=bytes(model_output["error"])
+                    status_code=500, content=bytes(model_output["error"], "utf-8")
                 )
 
         elif model_content.inference_framework == LLMInferenceFramework.VLLM:
             tokens = None
             if with_token_probs:
                 tokens = [
-                    TokenOutput(token=model_output["tokens"][index], log_prob=list(t.values())[0])
+                    TokenOutput(
+                        token=model_output["tokens"][index],
+                        log_prob=list(t.values())[0],
+                    )
                     for index, t in enumerate(model_output["log_probs"])
                 ]
             return CompletionOutput(
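The substantive fix in this hunk is the bytes() call: calling bytes() on a str without an encoding raises a TypeError, so the TGI error text was lost before it could be returned to the caller. A small illustration of the old versus new call:

    # Sketch of the behavior behind the one-line fix above.
    error_message = "Request failed during generation: Server error: transport error"

    try:
        bytes(error_message)  # old call: str with no encoding
    except TypeError as exc:
        print(exc)  # "string argument without an encoding"

    content = bytes(error_message, "utf-8")  # fixed call
    assert content == error_message.encode("utf-8")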
@@ -1003,7 +1012,6 @@ def model_output_to_completion_output(
                 tokens=tokens,
             )
         elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
-            print(model_output)
             tokens = None
             if with_token_probs:
                 tokens = [
@@ -1109,7 +1117,8 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination, predict_request=inference_request
+                topic=model_endpoint.record.destination,
+                predict_request=inference_request,
             )
 
             if predict_result.status == TaskStatus.SUCCESS and predict_result.result is not None:
@@ -1152,7 +1161,8 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination, predict_request=inference_request
+                topic=model_endpoint.record.destination,
+                predict_request=inference_request,
             )
 
             if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None:
@@ -1191,7 +1201,8 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination, predict_request=inference_request
+                topic=model_endpoint.record.destination,
+                predict_request=inference_request,
             )
 
             if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None:
@@ -1233,7 +1244,8 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination, predict_request=inference_request
+                topic=model_endpoint.record.destination,
+                predict_request=inference_request,
             )
 
             if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None:
@@ -1517,7 +1529,6 @@ async def execute(
             )
         elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
             if res.status == TaskStatus.SUCCESS and result is not None:
-                print(result)
                 token = None
                 num_completion_tokens += 1
                 if request.return_token_log_probs:

model-engine/tests/unit/conftest.py

Lines changed: 132 additions & 0 deletions
@@ -3672,6 +3672,138 @@ def llm_model_endpoint_sync(
     return model_endpoint, model_endpoint_json
 
 
+@pytest.fixture
+def llm_model_endpoint_sync_tgi(
+    test_api_key: str, model_bundle_1: ModelBundle
+) -> Tuple[ModelEndpoint, Any]:
+    model_endpoint = ModelEndpoint(
+        record=ModelEndpointRecord(
+            id="test_llm_model_endpoint_id_2",
+            name="test_llm_model_endpoint_name_1",
+            created_by=test_api_key,
+            created_at=datetime(2022, 1, 3),
+            last_updated_at=datetime(2022, 1, 3),
+            metadata={
+                "_llm": {
+                    "model_name": "llama-7b",
+                    "source": "hugging_face",
+                    "inference_framework": "text_generation_inference",
+                    "inference_framework_image_tag": "123",
+                    "num_shards": 4,
+                }
+            },
+            creation_task_id="test_creation_task_id",
+            endpoint_type=ModelEndpointType.SYNC,
+            destination="test_destination",
+            status=ModelEndpointStatus.READY,
+            current_model_bundle=model_bundle_1,
+            owner=test_api_key,
+            public_inference=True,
+        ),
+        infra_state=ModelEndpointInfraState(
+            deployment_name=f"{test_api_key}-test_llm_model_endpoint_name_1",
+            aws_role="test_aws_role",
+            results_s3_bucket="test_s3_bucket",
+            child_fn_info=None,
+            labels={},
+            prewarm=True,
+            high_priority=False,
+            deployment_state=ModelEndpointDeploymentState(
+                min_workers=1,
+                max_workers=3,
+                per_worker=2,
+                available_workers=1,
+                unavailable_workers=1,
+            ),
+            resource_state=ModelEndpointResourceState(
+                cpus=1,
+                gpus=1,
+                memory="1G",
+                gpu_type=GpuType.NVIDIA_TESLA_T4,
+                storage="10G",
+                optimize_costs=True,
+            ),
+            user_config_state=ModelEndpointUserConfigState(
+                app_config=model_bundle_1.app_config,
+                endpoint_config=ModelEndpointConfig(
+                    bundle_name=model_bundle_1.name,
+                    endpoint_name="test_llm_model_endpoint_name_1",
+                    post_inference_hooks=["callback"],
+                    default_callback_url="http://www.example.com",
+                    default_callback_auth=CallbackAuth(
+                        __root__=CallbackBasicAuth(
+                            kind="basic",
+                            username="test_username",
+                            password="test_password",
+                        ),
+                    ),
+                ),
+            ),
+            num_queued_items=1,
+            image="test_image",
+        ),
+    )
+    model_endpoint_json: Dict[str, Any] = {
+        "id": "test_llm_model_endpoint_id_2",
+        "name": "test_llm_model_endpoint_name_1",
+        "model_name": "llama-7b",
+        "source": "hugging_face",
+        "status": "READY",
+        "inference_framework": "text_generation_inference",
+        "inference_framework_image_tag": "123",
+        "num_shards": 4,
+        "spec": {
+            "id": "test_llm_model_endpoint_id_2",
+            "name": "test_llm_model_endpoint_name_1",
+            "endpoint_type": "sync",
+            "destination": "test_destination",
+            "deployment_name": f"{test_api_key}-test_llm_model_endpoint_name_1",
+            "metadata": {
+                "_llm": {
+                    "model_name": "llama-7b",
+                    "source": "hugging_face",
+                    "inference_framework": "text_generation_inference",
+                    "inference_framework_image_tag": "123",
+                    "num_shards": 4,
+                }
+            },
+            "bundle_name": "test_model_bundle_name_1",
+            "status": "READY",
+            "post_inference_hooks": ["callback"],
+            "default_callback_url": "http://www.example.com",
+            "default_callback_auth": {
+                "kind": "basic",
+                "username": "test_username",
+                "password": "test_password",
+            },
+            "labels": {},
+            "aws_role": "test_aws_role",
+            "results_s3_bucket": "test_s3_bucket",
+            "created_by": test_api_key,
+            "created_at": "2022-01-03T00:00:00",
+            "last_updated_at": "2022-01-03T00:00:00",
+            "deployment_state": {
+                "min_workers": 1,
+                "max_workers": 3,
+                "per_worker": 2,
+                "available_workers": 1,
+                "unavailable_workers": 1,
+            },
+            "resource_state": {
+                "cpus": "1",
+                "gpus": 1,
+                "memory": "1G",
+                "gpu_type": "nvidia-tesla-t4",
+                "storage": "10G",
+                "optimize_costs": True,
+            },
+            "num_queued_items": 1,
+            "public_inference": True,
+        },
+    }
+    return model_endpoint, model_endpoint_json
+
+
 @pytest.fixture
 def llm_model_endpoint_streaming(test_api_key: str, model_bundle_5: ModelBundle) -> ModelEndpoint:
     # model_bundle_5 is a runnable bundle

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 46 additions & 3 deletions
@@ -22,6 +22,7 @@
     ObjectHasInvalidValueException,
     ObjectNotAuthorizedException,
     ObjectNotFoundException,
+    UpstreamServiceError,
 )
 from model_engine_server.domain.use_cases.llm_fine_tuning_use_cases import (
     MAX_LLM_ENDPOINTS_PER_INTERNAL_USER,
@@ -171,7 +172,8 @@ async def test_create_model_endpoint_text_generation_inference_use_case_success(
     )
     user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
     response_1 = await use_case.execute(
-        user=user, request=create_llm_model_endpoint_text_generation_inference_request_streaming
+        user=user,
+        request=create_llm_model_endpoint_text_generation_inference_request_streaming,
     )
     assert response_1.endpoint_creation_task_id
     assert isinstance(response_1, CreateLLMModelEndpointV1Response)
@@ -196,7 +198,8 @@ async def test_create_model_endpoint_text_generation_inference_use_case_success(
 
     with pytest.raises(ObjectHasInvalidValueException):
         await use_case.execute(
-            user=user, request=create_llm_model_endpoint_text_generation_inference_request_async
+            user=user,
+            request=create_llm_model_endpoint_text_generation_inference_request_async,
         )
 
 
@@ -483,6 +486,40 @@ async def test_completion_sync_use_case_predict_failed(
     assert response_1.output is None
 
 
+@pytest.mark.asyncio
+async def test_completion_sync_use_case_predict_failed_with_errors(
+    test_api_key: str,
+    fake_model_endpoint_service,
+    fake_llm_model_endpoint_service,
+    llm_model_endpoint_sync_tgi: Tuple[ModelEndpoint, Any],
+    completion_sync_request: CompletionSyncV1Request,
+):
+    fake_llm_model_endpoint_service.add_model_endpoint(llm_model_endpoint_sync_tgi[0])
+    fake_model_endpoint_service.sync_model_endpoint_inference_gateway.response = SyncEndpointPredictV1Response(
+        status=TaskStatus.SUCCESS,
+        result={
+            "result": """
+    {
+        "error": "Request failed during generation: Server error: transport error",
+        "error_type": "generation"
+    }
+    """
+        },
+        traceback="failed to predict",
+    )
+    use_case = CompletionSyncV1UseCase(
+        model_endpoint_service=fake_model_endpoint_service,
+        llm_model_endpoint_service=fake_llm_model_endpoint_service,
+    )
+    user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
+    with pytest.raises(UpstreamServiceError):
+        await use_case.execute(
+            user=user,
+            model_endpoint_name=llm_model_endpoint_sync_tgi[0].record.name,
+            request=completion_sync_request,
+        )
+
+
 @pytest.mark.asyncio
 async def test_completion_sync_use_case_not_sync_endpoint_raises(
     test_api_key: str,
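For reference, a rough sketch of the TGI-style payload the new test above feeds through the fake sync gateway, and the "error" / "error_type" fields the completion use case keys off before raising UpstreamServiceError (illustrative parsing only, not the production code):

    import json

    # The mocked gateway wraps this JSON string under result["result"]; because it
    # carries an "error" key, the TGI branch raises UpstreamServiceError (status 500)
    # instead of building a CompletionOutput.
    raw_result = """
    {
        "error": "Request failed during generation: Server error: transport error",
        "error_type": "generation"
    }
    """

    model_output = json.loads(raw_result)
    assert model_output.get("error") is not None
    assert model_output["error_type"] == "generation"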
@@ -964,7 +1001,13 @@ async def test_delete_public_inference_model_raises_not_authorized(
 
 @pytest.mark.asyncio
 async def test_exclude_safetensors_or_bin_majority_bin_returns_exclude_safetensors():
-    fake_model_files = ["fake.bin", "fake2.bin", "fake3.safetensors", "model.json", "optimizer.pt"]
+    fake_model_files = [
+        "fake.bin",
+        "fake2.bin",
+        "fake3.safetensors",
+        "model.json",
+        "optimizer.pt",
+    ]
     assert _exclude_safetensors_or_bin(fake_model_files) == "*.safetensors"
 
 