 
 """
 
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional, TypeAlias, Union
 
 from model_engine_server.common.dtos.core import HttpUrlStr
+from model_engine_server.common.dtos.llms.sglang import SGLangEndpointAdditionalArgs
 from model_engine_server.common.dtos.llms.vllm import VLLMEndpointAdditionalArgs
 from model_engine_server.common.dtos.model_endpoints import (
     CpuSpecificationType,
     ModelEndpointStatus,
     Quantization,
 )
+from pydantic import Discriminator, Tag
+from typing_extensions import Annotated
 
 
-class CreateLLMModelEndpointV1Request(VLLMEndpointAdditionalArgs, BaseModel):
-    name: str
-
-    # LLM specific fields
-    model_name: str
-    source: LLMSource = LLMSource.HUGGING_FACE
-    inference_framework: LLMInferenceFramework = LLMInferenceFramework.VLLM
-    inference_framework_image_tag: str = "latest"
-    num_shards: int = 1
-    """
-    Number of shards to distribute the model onto GPUs.
-    """
-
+class LLMModelEndpointCommonArgs(BaseModel):
     quantize: Optional[Quantization] = None
     """
     Whether to quantize the model.
@@ -51,20 +42,14 @@ class CreateLLMModelEndpointV1Request(VLLMEndpointAdditionalArgs, BaseModel):
5142 """
5243
5344 # General endpoint fields
54- metadata : Dict [str , Any ] # TODO: JSON type
5545 post_inference_hooks : Optional [List [str ]] = None
56- endpoint_type : ModelEndpointType = ModelEndpointType .SYNC
5746 cpus : Optional [CpuSpecificationType ] = None
5847 gpus : Optional [int ] = None
5948 memory : Optional [StorageSpecificationType ] = None
6049 gpu_type : Optional [GpuType ] = None
6150 storage : Optional [StorageSpecificationType ] = None
6251 nodes_per_worker : Optional [int ] = None
6352 optimize_costs : Optional [bool ] = None
64- min_workers : int
65- max_workers : int
66- per_worker : int
67- labels : Dict [str , str ]
6853 prewarm : Optional [bool ] = None
6954 high_priority : Optional [bool ] = None
7055 billing_tags : Optional [Dict [str , Any ]] = None
@@ -77,6 +62,83 @@ class CreateLLMModelEndpointV1Request(VLLMEndpointAdditionalArgs, BaseModel):
     )
 
 
+class CreateLLMModelEndpointArgs(LLMModelEndpointCommonArgs):
+    name: str
+    model_name: str
+    metadata: Dict[str, Any]  # TODO: JSON type
+    min_workers: int
+    max_workers: int
+    per_worker: int
+    labels: Dict[str, str]
+    source: LLMSource = LLMSource.HUGGING_FACE
+    inference_framework_image_tag: str = "latest"
+    num_shards: int = 1
+    """
+    Number of shards to distribute the model onto GPUs.
+    """
+    endpoint_type: ModelEndpointType = ModelEndpointType.SYNC
+
+
+class CreateVLLMModelEndpointRequest(
+    VLLMEndpointAdditionalArgs, CreateLLMModelEndpointArgs, BaseModel
+):
+    inference_framework: Literal[LLMInferenceFramework.VLLM] = LLMInferenceFramework.VLLM
+    pass
+
+
+class CreateSGLangModelEndpointRequest(
+    SGLangEndpointAdditionalArgs, CreateLLMModelEndpointArgs, BaseModel
+):
+    inference_framework: Literal[LLMInferenceFramework.SGLANG] = LLMInferenceFramework.SGLANG
+    pass
+
+
+class CreateDeepSpeedModelEndpointRequest(CreateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.DEEPSPEED] = LLMInferenceFramework.DEEPSPEED
+    pass
+
+
+class CreateTextGenerationInferenceModelEndpointRequest(CreateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.TEXT_GENERATION_INFERENCE] = (
+        LLMInferenceFramework.TEXT_GENERATION_INFERENCE
+    )
+    pass
+
+
+class CreateLightLLMModelEndpointRequest(CreateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.LIGHTLLM] = LLMInferenceFramework.LIGHTLLM
+    pass
+
+
+class CreateTensorRTLLMModelEndpointRequest(CreateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.TENSORRT_LLM] = (
+        LLMInferenceFramework.TENSORRT_LLM
+    )
+    pass
+
+
+def get_inference_framework(v: Any) -> str:
+    if isinstance(v, dict):
+        return v.get("inference_framework", LLMInferenceFramework.VLLM)
+    return getattr(v, "inference_framework", LLMInferenceFramework.VLLM)
+
+
+CreateLLMModelEndpointV1Request: TypeAlias = Annotated[
+    Union[
+        Annotated[CreateVLLMModelEndpointRequest, Tag(LLMInferenceFramework.VLLM)],
+        Annotated[CreateSGLangModelEndpointRequest, Tag(LLMInferenceFramework.SGLANG)],
+        Annotated[CreateDeepSpeedModelEndpointRequest, Tag(LLMInferenceFramework.DEEPSPEED)],
+        Annotated[
+            CreateTextGenerationInferenceModelEndpointRequest,
+            Tag(LLMInferenceFramework.TEXT_GENERATION_INFERENCE),
+        ],
+        Annotated[CreateLightLLMModelEndpointRequest, Tag(LLMInferenceFramework.LIGHTLLM)],
+        Annotated[CreateTensorRTLLMModelEndpointRequest, Tag(LLMInferenceFramework.TENSORRT_LLM)],
+    ],
+    Discriminator(get_inference_framework),
+]
+
+
 class CreateLLMModelEndpointV1Response(BaseModel):
     endpoint_creation_task_id: str
 
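Aside, not part of the diff: the `Annotated[Union[...], Discriminator(get_inference_framework)]` alias above makes pydantic v2 dispatch an incoming payload to the framework-specific request class named by its `inference_framework` field, falling back to vLLM when the field is absent. A minimal sketch of that behavior, assuming the names above are in scope, that `LLMInferenceFramework` is a string enum with lowercase values (`"vllm"`, `"sglang"`, ...), and that the additional-args mixins have no required fields; the payload values are illustrative:

```python
from pydantic import TypeAdapter

adapter = TypeAdapter(CreateLLMModelEndpointV1Request)

payload = {
    "name": "demo-endpoint",  # illustrative values
    "model_name": "my-model",
    "metadata": {},
    "min_workers": 1,
    "max_workers": 2,
    "per_worker": 1,
    "labels": {"team": "demo"},
    "inference_framework": "sglang",
}

# The callable discriminator reads "inference_framework" and picks the SGLang class.
request = adapter.validate_python(payload)
assert isinstance(request, CreateSGLangModelEndpointRequest)

# Without the field, get_inference_framework falls back to vLLM.
payload.pop("inference_framework")
assert isinstance(adapter.validate_python(payload), CreateVLLMModelEndpointRequest)
```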
@@ -107,57 +169,75 @@ class ListLLMModelEndpointsV1Response(BaseModel):
     model_endpoints: List[GetLLMModelEndpointV1Response]
 
 
-class UpdateLLMModelEndpointV1Request(VLLMEndpointAdditionalArgs, BaseModel):
-    # LLM specific fields
+class UpdateLLMModelEndpointArgs(LLMModelEndpointCommonArgs):
     model_name: Optional[str] = None
     source: Optional[LLMSource] = None
+    inference_framework: Optional[LLMInferenceFramework] = None
     inference_framework_image_tag: Optional[str] = None
     num_shards: Optional[int] = None
     """
     Number of shards to distribute the model onto GPUs.
     """
-
-    quantize: Optional[Quantization] = None
-    """
-    Whether to quantize the model.
+    metadata: Optional[Dict[str, Any]] = None
+    force_bundle_recreation: Optional[bool] = False
     """
+    Whether to force recreate the underlying bundle.
 
-    checkpoint_path: Optional[str] = None
-    """
-    Path to the checkpoint to load the model from.
+    If True, the underlying bundle will be recreated. This is useful if there are underlying implementation changes with how bundles are created
+    that we would like to pick up for existing endpoints
     """
-
-    # General endpoint fields
-    metadata: Optional[Dict[str, Any]] = None
-    post_inference_hooks: Optional[List[str]] = None
-    cpus: Optional[CpuSpecificationType] = None
-    gpus: Optional[int] = None
-    memory: Optional[StorageSpecificationType] = None
-    gpu_type: Optional[GpuType] = None
-    storage: Optional[StorageSpecificationType] = None
-    optimize_costs: Optional[bool] = None
     min_workers: Optional[int] = None
     max_workers: Optional[int] = None
     per_worker: Optional[int] = None
     labels: Optional[Dict[str, str]] = None
-    prewarm: Optional[bool] = None
-    high_priority: Optional[bool] = None
-    billing_tags: Optional[Dict[str, Any]] = None
-    default_callback_url: Optional[HttpUrlStr] = None
-    default_callback_auth: Optional[CallbackAuth] = None
-    public_inference: Optional[bool] = None
-    chat_template_override: Optional[str] = Field(
-        default=None,
-        description="A Jinja template to use for this endpoint. If not provided, will use the chat template from the checkpoint",
+
+
+class UpdateVLLMModelEndpointRequest(
+    VLLMEndpointAdditionalArgs, UpdateLLMModelEndpointArgs, BaseModel
+):
+    inference_framework: Literal[LLMInferenceFramework.VLLM] = LLMInferenceFramework.VLLM
+
+
+class UpdateSGLangModelEndpointRequest(
+    SGLangEndpointAdditionalArgs, UpdateLLMModelEndpointArgs, BaseModel
+):
+    inference_framework: Literal[LLMInferenceFramework.SGLANG] = LLMInferenceFramework.SGLANG
+
+
+class UpdateDeepSpeedModelEndpointRequest(UpdateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.DEEPSPEED] = LLMInferenceFramework.DEEPSPEED
+
+
+class UpdateTextGenerationInferenceModelEndpointRequest(UpdateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.TEXT_GENERATION_INFERENCE] = (
+        LLMInferenceFramework.TEXT_GENERATION_INFERENCE
     )
 
-    force_bundle_recreation: Optional[bool] = False
-    """
-    Whether to force recreate the underlying bundle.
 
-    If True, the underlying bundle will be recreated. This is useful if there are underlying implementation changes with how bundles are created
-    that we would like to pick up for existing endpoints
-    """
+class UpdateLightLLMModelEndpointRequest(UpdateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.LIGHTLLM] = LLMInferenceFramework.LIGHTLLM
+
+
+class UpdateTensorRTLLMModelEndpointRequest(UpdateLLMModelEndpointArgs, BaseModel):
+    inference_framework: Literal[LLMInferenceFramework.TENSORRT_LLM] = (
+        LLMInferenceFramework.TENSORRT_LLM
+    )
+
+
+UpdateLLMModelEndpointV1Request: TypeAlias = Annotated[
+    Union[
+        Annotated[UpdateVLLMModelEndpointRequest, Tag(LLMInferenceFramework.VLLM)],
+        Annotated[UpdateSGLangModelEndpointRequest, Tag(LLMInferenceFramework.SGLANG)],
+        Annotated[UpdateDeepSpeedModelEndpointRequest, Tag(LLMInferenceFramework.DEEPSPEED)],
+        Annotated[
+            UpdateTextGenerationInferenceModelEndpointRequest,
+            Tag(LLMInferenceFramework.TEXT_GENERATION_INFERENCE),
+        ],
+        Annotated[UpdateLightLLMModelEndpointRequest, Tag(LLMInferenceFramework.LIGHTLLM)],
+        Annotated[UpdateTensorRTLLMModelEndpointRequest, Tag(LLMInferenceFramework.TENSORRT_LLM)],
+    ],
+    Discriminator(get_inference_framework),
+]
 
 
 class UpdateLLMModelEndpointV1Response(BaseModel):
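The update path works the same way, with the same caveats as the sketch above (pydantic v2 `TypeAdapter`, lowercase enum string values). Since every field on `UpdateLLMModelEndpointArgs` is optional, a partial payload naming only the changed fields validates and routes to the framework-specific class:

```python
from pydantic import TypeAdapter

update_adapter = TypeAdapter(UpdateLLMModelEndpointV1Request)

# Partial update: only the fields being changed are supplied.
update = update_adapter.validate_python(
    {
        "inference_framework": "vllm",
        "max_workers": 4,
        "force_bundle_recreation": True,
    }
)
assert isinstance(update, UpdateVLLMModelEndpointRequest)
assert update.min_workers is None  # untouched fields remain None
```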