@@ -30,21 +30,21 @@ class CreateLLMModelEndpointV1Request(BaseModel):
     # LLM specific fields
     model_name: str
     source: LLMSource = LLMSource.HUGGING_FACE
-    inference_framework: LLMInferenceFramework = LLMInferenceFramework.DEEPSPEED
-    inference_framework_image_tag: str
+    inference_framework: LLMInferenceFramework = LLMInferenceFramework.VLLM
+    inference_framework_image_tag: str = "latest"
     num_shards: int = 1
     """
-    Number of shards to distribute the model onto GPUs. Only affects behavior for text-generation-inference models
+    Number of shards to distribute the model onto GPUs.
     """
 
     quantize: Optional[Quantization] = None
     """
-    Whether to quantize the model. Only affect behavior for text-generation-inference models
+    Whether to quantize the model.
     """
 
     checkpoint_path: Optional[str] = None
     """
-    Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models
+    Path to the checkpoint to load the model from.
     """
 
     # General endpoint fields
@@ -102,17 +102,17 @@ class UpdateLLMModelEndpointV1Request(BaseModel):
     inference_framework_image_tag: Optional[str]
     num_shards: Optional[int]
     """
-    Number of shards to distribute the model onto GPUs. Only affects behavior for text-generation-inference models
+    Number of shards to distribute the model onto GPUs.
     """
 
     quantize: Optional[Quantization]
     """
-    Whether to quantize the model. Only affect behavior for text-generation-inference models
+    Whether to quantize the model.
     """
 
     checkpoint_path: Optional[str]
     """
-    Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models
+    Path to the checkpoint to load the model from.
     """
 
     # General endpoint fields
@@ -220,7 +220,7 @@ class CompletionStreamV1Request(BaseModel):
220220 """
221221 return_token_log_probs : Optional [bool ] = False
222222 """
223- Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models
223+ Whether to return the log probabilities of the tokens.
224224 """
225225 presence_penalty : Optional [float ] = Field (default = None , ge = 0.0 , le = 2.0 )
226226 """
@@ -359,3 +359,104 @@ class ModelDownloadResponse(BaseModel):
 
 class DeleteLLMEndpointResponse(BaseModel):
     deleted: bool
+
+
+class CreateBatchCompletionsRequestContent(BaseModel):
+    prompts: List[str]
+    max_new_tokens: int
+    temperature: float = Field(ge=0.0, le=1.0)
+    """
+    Temperature of the sampling. Setting it to 0 is equivalent to greedy sampling.
+    """
+    stop_sequences: Optional[List[str]] = None
+    """
+    List of sequences to stop the completion at.
+    """
+    return_token_log_probs: Optional[bool] = False
+    """
+    Whether to return the log probabilities of the tokens.
+    """
+    presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+    """
+    Only supported in vllm, lightllm.
+    Penalizes new tokens based on whether they appear in the text so far. 0.0 means no penalty.
+    """
+    frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+    """
+    Only supported in vllm, lightllm.
+    Penalizes new tokens based on their existing frequency in the text so far. 0.0 means no penalty.
+    """
+    top_k: Optional[int] = Field(default=None, ge=-1)
+    """
+    Controls the number of top tokens to consider. -1 means consider all tokens.
+    """
+    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)
+    """
+    Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens.
+    """
+
+
+class CreateBatchCompletionsModelConfig(BaseModel):
+    model: str
+    checkpoint_path: Optional[str] = None
+    """
+    Path to the checkpoint to load the model from.
+    """
+    labels: Dict[str, str]
+    """
+    Labels to attach to the batch inference job.
+    """
+    num_shards: Optional[int] = 1
+    """
+    Suggested number of shards to distribute the model across GPUs. When not specified, the number of shards is inferred from the model config.
+    The system may use a different number than the given value.
+    """
+    quantize: Optional[Quantization] = None
+    """
+    Whether to quantize the model.
+    """
+    seed: Optional[int] = None
+    """
+    Random seed for the model.
+    """
+
+
+class CreateBatchCompletionsRequest(BaseModel):
+    """
+    Request object for batch completions.
+    """
+
+    input_data_path: Optional[str]
+    output_data_path: str
+    """
+    Path to the output file. The output file will be a JSON file of type List[CompletionOutput].
+    """
+    content: Optional[CreateBatchCompletionsRequestContent] = None
+    """
+    Either `input_data_path` or `content` needs to be provided.
+    When input_data_path is provided, the input file should be a JSON file of type CreateBatchCompletionsRequestContent.
+    """
+    model_config: CreateBatchCompletionsModelConfig
+    """
+    Model configuration for the batch inference. Hardware configurations are inferred.
+    """
+    data_parallelism: Optional[int] = Field(default=1, ge=1, le=64)
+    """
+    Number of replicas to run the batch inference with. More replicas are slower to schedule but faster at inference.
+    """
+    max_runtime_sec: Optional[int] = Field(default=24 * 3600, ge=1, le=2 * 24 * 3600)
+    """
+    Maximum runtime of the batch inference in seconds. Defaults to one day.
+    """
+
+
+class CreateBatchCompletionsResponse(BaseModel):
+    job_id: str
+
+
+class GetBatchCompletionsResponse(BaseModel):
+    progress: float
+    """
+    Progress of the batch inference as a percentage from 0 to 100.
+    """
+    finished: bool
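
For orientation, here is a minimal usage sketch (not part of this diff) showing how the batch completions models added above might be constructed and serialized with pydantic. The import path, model name, prompts, and output path are illustrative assumptions; only the field names come from the classes in the diff.

```python
# Sketch only: builds the new batch completions request objects and dumps them
# to JSON. The import path below is an assumption about where these models live.
from model_engine_server.common.dtos.llms import (
    CreateBatchCompletionsModelConfig,
    CreateBatchCompletionsRequest,
    CreateBatchCompletionsRequestContent,
)

# Inline prompts; alternatively, input_data_path can point to a JSON file of
# type CreateBatchCompletionsRequestContent instead of passing `content`.
content = CreateBatchCompletionsRequestContent(
    prompts=["What is machine learning?", "Summarize the plot of Hamlet."],
    max_new_tokens=128,
    temperature=0.2,
)

request = CreateBatchCompletionsRequest(
    output_data_path="s3://my-bucket/batch-output.json",  # hypothetical path
    content=content,
    model_config=CreateBatchCompletionsModelConfig(
        model="llama-2-7b",           # hypothetical model name
        labels={"team": "demo"},      # labels attached to the batch job
        num_shards=1,
    ),
    data_parallelism=2,               # run two replicas of the batch job
)

# pydantic v1-style serialization, matching the BaseModel usage in the diff
print(request.json(exclude_none=True))
```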