@@ -43,15 +43,15 @@ def create(
4343 quantize: Optional[Quantization] = None,
4444 checkpoint_path: Optional[str] = None,
4545 # General endpoint fields
46- cpus: int = 8,
47- memory: str = "24Gi",
48- storage: str = "40Gi",
49- gpus: int = 1,
46+ cpus: Optional[int] = None,
47+ memory: Optional[str] = None,
48+ storage: Optional[str] = None,
49+ gpus: Optional[int] = None,
5050 min_workers: int = 0,
5151 max_workers: int = 1,
5252 per_worker: int = 2,
5353 endpoint_type: ModelEndpointType = ModelEndpointType.STREAMING,
54- gpu_type: Optional[str] = "nvidia-ampere-a10",
54+ gpu_type: Optional[str] = None,
5555 high_priority: Optional[bool] = False,
5656 post_inference_hooks: Optional[List[PostInferenceHooks]] = None,
5757 default_callback_url: Optional[str] = None,
@@ -91,21 +91,23 @@ def create(
9191 Can be either a folder or a tar file. A folder is preferred since we don't need to untar it and the model loads faster.
9292 For model weights, safetensors are preferred but PyTorch checkpoints are also accepted (model loading will take longer).
9393
94- cpus (`int`):
94+ cpus (`Optional[int]`):
9595 Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater
96- than or equal to 1. Recommendation is set it to 8 * GPU count.
96+ than or equal to 1. Recommendation is to set it to 8 * GPU count. If not provided, this can be inferred from the model size.
9797
98- memory (`str`):
98+ memory (`Optional[str]`):
9999 Amount of memory each worker should get, e.g. "4Gi", "512Mi", etc. This must
100100 be a positive amount of memory. Recommendation is to set it to 24Gi * GPU count.
101+ If not provided, this can be inferred from the model size.
101102
102- storage (`str`):
103+ storage (`Optional[str]`):
103104 Amount of local ephemeral storage each worker should get, e.g. "4Gi",
104105 "512Mi", etc. This must be a positive amount of storage.
105106 Recommendation is 40Gi for 7B models, 80Gi for 13B models, and 200Gi for 70B models.
107+ If not provided, this can be inferred from the model size.
106108
107- gpus (`int`):
108- Number of gpus each worker should get, e.g. 0, 1, etc.
109+ gpus (`Optional[int]`):
110+ Number of gpus each worker should get, e.g. 0, 1, etc. If not provided, this can be inferred from the model size.
109111
110112 min_workers (`int`):
111113 The minimum number of workers. Must be greater than or equal to 0. This
@@ -142,15 +144,15 @@ def create(
142144
143145 gpu_type (`Optional[str]`):
144146 If specifying a non-zero number of gpus, this controls the type of gpu
145- requested. Here are the supported values:
147+ requested. If not provided, this can be inferred from the model size. Here are the supported values:
146148
147149 - ``nvidia-tesla-t4``
148150 - ``nvidia-ampere-a10``
149151 - ``nvidia-ampere-a100``
150152 - ``nvidia-ampere-a100e``
151153 - ``nvidia-hopper-h100``
152- - ``nvidia-hopper-h100-1g20gb``
153- - ``nvidia-hopper-h100-3g40gb``
154+ - ``nvidia-hopper-h100-1g20gb`` (one MIG slice with 1g compute and 20GB memory)
155+ - ``nvidia-hopper-h100-3g40gb`` (one MIG slice with 3g compute and 40GB memory)
154156
155157 high_priority (`Optional[bool]`):
156158 Either ``True`` or ``False``. Enabling this will allow the created
@@ -173,7 +175,27 @@ def create(
173175 Returns:
174176 CreateLLMEndpointResponse: creation task ID of the created Model. Currently not used.
175177
176- === "Create Llama 2 7B model in Python"
178+ === "Create Llama 2 70B model with hardware specs inferred in Python"
179+ ```python
180+ from llmengine import Model
181+ from llmengine.data_types import LLMInferenceFramework, ModelEndpointType
182+ response = Model.create(
183+ name="llama-2-70b-test",
184+ model="llama-2-70b",
185+ inference_framework_image_tag="0.9.4",
186+ inference_framework=LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
187+ num_shards=4,
188+ checkpoint_path="s3://path/to/checkpoint",
189+ min_workers=0,
190+ max_workers=1,
191+ per_worker=10,
192+ endpoint_type=ModelEndpointType.STREAMING,
193+ public_inference=False,
194+ )
195+
196+ print(response.json())
197+ ```
198+ === "Create Llama 2 7B model with hardware specs specified in Python"
177199 ```python
178200 from llmengine import Model
179201
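For reference, the new `Optional` defaults mean a caller can now omit the hardware fields entirely. Below is a minimal sketch under the assumptions in the docstring above: the endpoint name and checkpoint path are placeholders, and `num_shards` is passed explicitly because its default is not shown in this diff.

```python
from llmengine import Model

# Minimal sketch: cpus, memory, storage, gpus, and gpu_type are all left as
# None, so per the updated docstring they are inferred from the model size.
response = Model.create(
    name="llama-2-7b-test",  # placeholder endpoint name
    model="llama-2-7b",
    inference_framework_image_tag="0.9.4",
    num_shards=1,  # passed explicitly; its default is not visible in this diff
    checkpoint_path="s3://path/to/checkpoint",  # placeholder path from the examples above
)

print(response.json())
```

Explicitly passed values still take precedence over inference, as in the "hardware specs specified" example above.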