Skip to content

Commit 4d9d934

Browse files
Sync scale from zero, part 1 (#229)
* add patch files * fix ruff
1 parent c53f3c4 commit 4d9d934

23 files changed

+337
-55
lines changed

model-engine/model_engine_server/api/dependencies.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
LiveStreamingModelEndpointInferenceGateway,
5050
LiveSyncModelEndpointInferenceGateway,
5151
ModelEndpointInfraGateway,
52+
RedisInferenceAutoscalingMetricsGateway,
5253
S3FilesystemGateway,
5354
S3LLMArtifactGateway,
5455
)
@@ -179,6 +180,9 @@ def _get_external_interfaces(
179180
model_endpoints_schema_gateway = LiveModelEndpointsSchemaGateway(
180181
filesystem_gateway=filesystem_gateway
181182
)
183+
inference_autoscaling_metrics_gateway = RedisInferenceAutoscalingMetricsGateway(
184+
redis_client=redis_client
185+
) # we can just reuse the existing redis client, we shouldn't get key collisions because of the prefix
182186
model_endpoint_service = LiveModelEndpointService(
183187
model_endpoint_record_repository=model_endpoint_record_repo,
184188
model_endpoint_infra_gateway=model_endpoint_infra_gateway,
@@ -187,6 +191,7 @@ def _get_external_interfaces(
187191
streaming_model_endpoint_inference_gateway=streaming_model_endpoint_inference_gateway,
188192
sync_model_endpoint_inference_gateway=sync_model_endpoint_inference_gateway,
189193
model_endpoints_schema_gateway=model_endpoints_schema_gateway,
194+
inference_autoscaling_metrics_gateway=inference_autoscaling_metrics_gateway,
190195
)
191196
llm_model_endpoint_service = LiveLLMModelEndpointService(
192197
model_endpoint_record_repository=model_endpoint_record_repo,

model-engine/model_engine_server/api/tasks_v1.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
CreateAsyncTaskV1Response,
1212
EndpointPredictV1Request,
1313
GetAsyncTaskV1Response,
14+
SyncEndpointPredictV1Request,
1415
SyncEndpointPredictV1Response,
1516
TaskStatus,
1617
)
@@ -97,7 +98,7 @@ def get_async_inference_task(
9798
@inference_task_router_v1.post("/sync-tasks", response_model=SyncEndpointPredictV1Response)
9899
async def create_sync_inference_task(
99100
model_endpoint_id: str,
100-
request: EndpointPredictV1Request,
101+
request: SyncEndpointPredictV1Request,
101102
auth: User = Depends(verify_authentication),
102103
external_interfaces: ExternalInterfaces = Depends(get_external_interfaces_read_only),
103104
) -> SyncEndpointPredictV1Response:
@@ -137,7 +138,7 @@ async def create_sync_inference_task(
137138
@inference_task_router_v1.post("/streaming-tasks")
138139
async def create_streaming_inference_task(
139140
model_endpoint_id: str,
140-
request: EndpointPredictV1Request,
141+
request: SyncEndpointPredictV1Request,
141142
auth: User = Depends(verify_authentication),
142143
external_interfaces: ExternalInterfaces = Depends(get_external_interfaces_read_only),
143144
) -> EventSourceResponse:

model-engine/model_engine_server/common/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def get_model_cache_directory_name(model_name: str):
4545
class HostedModelInferenceServiceConfig:
4646
endpoint_namespace: str
4747
billing_queue_arn: str
48-
cache_redis_url: str
48+
cache_redis_url: str # also using this to store sync autoscaling metrics
4949
sqs_profile: str
5050
sqs_queue_policy_template: str
5151
sqs_queue_tag_template: str

model-engine/model_engine_server/common/dtos/tasks.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from typing import Any, Optional
77

88
from model_engine_server.domain.entities import CallbackAuth
9-
from pydantic import BaseModel
9+
from pydantic import BaseModel, Field
1010

1111

1212
class ResponseSchema(BaseModel):
@@ -49,3 +49,10 @@ class EndpointPredictV1Request(BaseModel):
4949
callback_url: Optional[str] = None
5050
callback_auth: Optional[CallbackAuth] = None
5151
return_pickled: bool = False
52+
53+
54+
class SyncEndpointPredictV1Request(EndpointPredictV1Request):
55+
timeout_seconds: Optional[float] = Field(default=None, gt=0)
56+
num_retries: Optional[int] = Field(default=None, ge=0)
57+
# See live_{sync,streaming}_model_endpoint_inference_gateway to see how timeout_seconds/num_retries interact.
58+
# Also these fields are only relevant for sync endpoints

model-engine/model_engine_server/domain/exceptions.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ class TooManyRequestsException(DomainException):
5959
"""
6060

6161

62+
class NoHealthyUpstreamException(DomainException):
63+
"""
64+
Thrown if an endpoint returns a 503 exception for no healthy upstream. This can happen if there are zero pods
65+
available to serve the request.
66+
"""
67+
68+
6269
class CorruptRecordInfraStateException(DomainException):
6370
"""
6471
Thrown if the data from existing state (i.e. the db, k8s, etc.) is somehow uninterpretable

model-engine/model_engine_server/domain/gateways/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from .cron_job_gateway import CronJobGateway
33
from .docker_image_batch_job_gateway import DockerImageBatchJobGateway
44
from .file_storage_gateway import FileStorageGateway
5+
from .inference_autoscaling_metrics_gateway import InferenceAutoscalingMetricsGateway
56
from .llm_artifact_gateway import LLMArtifactGateway
67
from .model_endpoints_schema_gateway import ModelEndpointsSchemaGateway
78
from .model_primitive_gateway import ModelPrimitiveGateway
@@ -15,6 +16,7 @@
1516
"CronJobGateway",
1617
"DockerImageBatchJobGateway",
1718
"FileStorageGateway",
19+
"InferenceAutoscalingMetricsGateway",
1820
"LLMArtifactGateway",
1921
"ModelEndpointsSchemaGateway",
2022
"ModelPrimitiveGateway",
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
from abc import ABC, abstractmethod


class InferenceAutoscalingMetricsGateway(ABC):
    """Interface for emitting per-endpoint autoscaling metrics on inference traffic.

    Concrete implementations publish a signal (e.g. to Redis — see
    RedisInferenceAutoscalingMetricsGateway) that an external autoscaler
    resource, such as a Keda ScaledObject, can consume to scale inference
    endpoints.
    """

    @abstractmethod
    async def emit_inference_autoscaling_metric(self, endpoint_id: str):
        """Record that an inference request hit the endpoint identified by *endpoint_id*."""
        ...

    @abstractmethod
    async def emit_prewarm_metric(self, endpoint_id: str):
        """Emit a metric for *endpoint_id* to warm the endpoint up ahead of real traffic."""
        ...

model-engine/model_engine_server/domain/gateways/streaming_model_endpoint_inference_gateway.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from typing import AsyncIterable
33

44
from model_engine_server.common.dtos.tasks import (
5-
EndpointPredictV1Request,
5+
SyncEndpointPredictV1Request,
66
SyncEndpointPredictV1Response,
77
)
88

@@ -17,7 +17,7 @@ class StreamingModelEndpointInferenceGateway(ABC):
1717

1818
@abstractmethod
1919
def streaming_predict(
20-
self, topic: str, predict_request: EndpointPredictV1Request
20+
self, topic: str, predict_request: SyncEndpointPredictV1Request
2121
) -> AsyncIterable[SyncEndpointPredictV1Response]:
2222
"""
2323
Runs a prediction request and returns a streaming response.

model-engine/model_engine_server/domain/gateways/sync_model_endpoint_inference_gateway.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from abc import ABC, abstractmethod
22

33
from model_engine_server.common.dtos.tasks import (
4-
EndpointPredictV1Request,
4+
SyncEndpointPredictV1Request,
55
SyncEndpointPredictV1Response,
66
)
77

@@ -16,7 +16,7 @@ class SyncModelEndpointInferenceGateway(ABC):
1616

1717
@abstractmethod
1818
async def predict(
19-
self, topic: str, predict_request: EndpointPredictV1Request
19+
self, topic: str, predict_request: SyncEndpointPredictV1Request
2020
) -> SyncEndpointPredictV1Response:
2121
"""
2222
Runs a prediction request and returns a response.

model-engine/model_engine_server/domain/services/model_endpoint_service.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
StreamingModelEndpointInferenceGateway,
1919
SyncModelEndpointInferenceGateway,
2020
)
21+
from model_engine_server.domain.gateways.inference_autoscaling_metrics_gateway import (
22+
InferenceAutoscalingMetricsGateway,
23+
)
2124

2225

2326
class ModelEndpointService(ABC):
@@ -49,6 +52,14 @@ def get_streaming_model_endpoint_inference_gateway(
4952
Returns the sync model endpoint inference gateway.
5053
"""
5154

55+
@abstractmethod
56+
def get_inference_auto_scaling_metrics_gateway(
57+
self,
58+
) -> InferenceAutoscalingMetricsGateway:
59+
"""
60+
Returns the inference autoscaling metrics gateway.
61+
"""
62+
5263
@abstractmethod
5364
async def create_model_endpoint(
5465
self,

0 commit comments

Comments
 (0)