@@ -30,21 +30,21 @@ class CreateLLMModelEndpointV1Request(BaseModel):
     # LLM specific fields
     model_name: str
     source: LLMSource = LLMSource.HUGGING_FACE
-    inference_framework: LLMInferenceFramework = LLMInferenceFramework.DEEPSPEED
-    inference_framework_image_tag: str
+    inference_framework: LLMInferenceFramework = LLMInferenceFramework.VLLM
+    inference_framework_image_tag: str = "latest"
     num_shards: int = 1
     """
-    Number of shards to distribute the model onto GPUs. Only affects behavior for text-generation-inference models
+    Number of shards to distribute the model onto GPUs.
     """
 
     quantize: Optional[Quantization] = None
     """
-    Whether to quantize the model. Only affect behavior for text-generation-inference models
+    Whether to quantize the model.
     """
 
     checkpoint_path: Optional[str] = None
     """
-    Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models
+    Path to the checkpoint to load the model from.
     """
 
     # General endpoint fields
@@ -102,17 +102,17 @@ class UpdateLLMModelEndpointV1Request(BaseModel):
     inference_framework_image_tag: Optional[str]
     num_shards: Optional[int]
     """
-    Number of shards to distribute the model onto GPUs. Only affects behavior for text-generation-inference models
+    Number of shards to distribute the model onto GPUs.
     """
 
     quantize: Optional[Quantization]
     """
-    Whether to quantize the model. Only affect behavior for text-generation-inference models
+    Whether to quantize the model.
     """
 
     checkpoint_path: Optional[str]
     """
-    Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models
+    Path to the checkpoint to load the model from.
     """
 
     # General endpoint fields
@@ -220,7 +220,7 @@ class CompletionStreamV1Request(BaseModel):
220220 """
221221 return_token_log_probs : Optional [bool ] = False
222222 """
223- Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models
223+ Whether to return the log probabilities of the tokens.
224224 """
225225 presence_penalty : Optional [float ] = Field (default = None , ge = 0.0 , le = 2.0 )
226226 """
@@ -359,3 +359,104 @@ class ModelDownloadResponse(BaseModel):
 
 class DeleteLLMEndpointResponse(BaseModel):
     deleted: bool
+
+
+class CreateBatchCompletionsRequestContent(BaseModel):
+    prompts: List[str]
+    max_new_tokens: int
+    temperature: float = Field(ge=0.0, le=1.0)
+    """
+    Temperature of the sampling. Setting it to 0 is equivalent to greedy sampling.
+    """
+    stop_sequences: Optional[List[str]] = None
+    """
+    List of sequences to stop the completion at.
+    """
+    return_token_log_probs: Optional[bool] = False
+    """
+    Whether to return the log probabilities of the tokens.
+    """
+    presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+    """
+    Only supported in vllm, lightllm.
+    Penalizes new tokens based on whether they appear in the text so far. 0.0 means no penalty.
+    """
+    frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+    """
+    Only supported in vllm, lightllm.
+    Penalizes new tokens based on their existing frequency in the text so far. 0.0 means no penalty.
+    """
+    top_k: Optional[int] = Field(default=None, ge=-1)
+    """
+    Controls the number of top tokens to consider. -1 means consider all tokens.
+    """
+    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)
+    """
+    Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens.
+    """
+
+
+class CreateBatchCompletionsModelConfig(BaseModel):
+    model: str
+    checkpoint_path: Optional[str] = None
+    """
+    Path to the checkpoint to load the model from.
+    """
+    labels: Dict[str, str]
+    """
+    Labels to attach to the batch inference job.
+    """
+    num_shards: Optional[int] = 1
+    """
+    Suggested number of shards to distribute the model across GPUs. When not specified, the number of shards is inferred from the model config.
+    The system may use a different number than the given value.
+    """
+    quantize: Optional[Quantization] = None
+    """
+    Whether to quantize the model.
+    """
+    seed: Optional[int] = None
+    """
+    Random seed for the model.
+    """
+
+
+class CreateBatchCompletionsRequest(BaseModel):
+    """
+    Request object for batch completions.
+    """
+
+    input_data_path: Optional[str]
+    output_data_path: str
+    """
+    Path to the output file. The output file will be a JSON file of type List[CompletionOutput].
+    """
+    content: Optional[CreateBatchCompletionsRequestContent] = None
+    """
+    Either `input_data_path` or `content` needs to be provided.
+    When input_data_path is provided, the input file should be a JSON file of type CreateBatchCompletionsRequestContent.
+    """
+    model_config: CreateBatchCompletionsModelConfig
+    """
+    Model configuration for the batch inference. Hardware configurations are inferred.
+    """
+    data_parallelism: Optional[int] = Field(default=1, ge=1, le=64)
+    """
+    Number of replicas to run the batch inference with. More replicas are slower to schedule but faster at inference.
+    """
+    max_runtime_sec: Optional[int] = Field(default=24 * 3600, ge=1, le=2 * 24 * 3600)
+    """
+    Maximum runtime of the batch inference in seconds. Defaults to one day.
+    """
+
+
+class CreateBatchCompletionsResponse(BaseModel):
+    job_id: str
+
+
+class GetBatchCompletionsResponse(BaseModel):
+    progress: float
+    """
+    Progress of the batch inference as a percentage from 0 to 100.
+    """
+    finished: bool
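
For orientation, here is a minimal usage sketch (not part of this diff) showing how the batch completions models added above might be constructed and serialized with pydantic. The import path, model name, prompts, and output path are illustrative assumptions; only the field names come from the classes in the diff.

```python
# Sketch only: builds the new batch completions request objects and dumps them
# to JSON. The import path below is an assumption about where these models live.
from model_engine_server.common.dtos.llms import (
    CreateBatchCompletionsModelConfig,
    CreateBatchCompletionsRequest,
    CreateBatchCompletionsRequestContent,
)

# Inline prompts; alternatively, input_data_path can point to a JSON file of
# type CreateBatchCompletionsRequestContent instead of passing `content`.
content = CreateBatchCompletionsRequestContent(
    prompts=["What is machine learning?", "Summarize the plot of Hamlet."],
    max_new_tokens=128,
    temperature=0.2,
)

request = CreateBatchCompletionsRequest(
    output_data_path="s3://my-bucket/batch-output.json",  # hypothetical path
    content=content,
    model_config=CreateBatchCompletionsModelConfig(
        model="llama-2-7b",           # hypothetical model name
        labels={"team": "demo"},      # labels attached to the batch job
        num_shards=1,
    ),
    data_parallelism=2,               # run two replicas of the batch job
)

# pydantic v1-style serialization, matching the BaseModel usage in the diff
print(request.json(exclude_none=True))
```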