
Commit 1f474ba

Allow hardware infer from client (#555)
* Allow hardware infer from client
* Fix

1 parent 8860ee3 · commit 1f474ba

File tree

- clients/python/llmengine/data_types.py
- clients/python/llmengine/model.py

2 files changed: +40 -18 lines


clients/python/llmengine/data_types.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -164,9 +164,9 @@ class CreateLLMEndpointRequest(BaseModel):
     metadata: Dict[str, Any]  # TODO: JSON type
     post_inference_hooks: Optional[List[str]]
     endpoint_type: ModelEndpointType = ModelEndpointType.STREAMING
-    cpus: CpuSpecificationType
-    gpus: int
-    memory: StorageSpecificationType
+    cpus: Optional[CpuSpecificationType]
+    gpus: Optional[int]
+    memory: Optional[StorageSpecificationType]
     gpu_type: Optional[GpuType]
     storage: Optional[StorageSpecificationType]
     optimize_costs: Optional[bool] = None
```
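
In practice, the `data_types.py` change means a client request may omit the hardware fields entirely and leave the choice to the server, which infers them from the model size. A minimal sketch of that validation behavior, using a hypothetical stand-in model rather than the real `CreateLLMEndpointRequest` (plain `int`/`str` fields instead of `CpuSpecificationType`/`StorageSpecificationType`, and only a subset of the real request fields):

```python
from typing import Optional

from pydantic import BaseModel


class EndpointHardware(BaseModel):
    """Hypothetical stand-in for the hardware fields of CreateLLMEndpointRequest."""

    cpus: Optional[int] = None      # previously required -> omitting it failed validation
    gpus: Optional[int] = None
    memory: Optional[str] = None
    gpu_type: Optional[str] = None


# With the fields now Optional, a request that omits all hardware specs still
# validates; None is the signal for the server to infer values from model size.
req = EndpointHardware()
print(req.dict(exclude_none=True))  # {} -> nothing pinned (.dict is the pydantic v1 spelling)

# Explicitly pinned fields are still passed through unchanged.
req = EndpointHardware(gpus=2, gpu_type="nvidia-ampere-a10")
print(req.dict(exclude_none=True))  # {'gpus': 2, 'gpu_type': 'nvidia-ampere-a10'}
```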

clients/python/llmengine/model.py

Lines changed: 37 additions & 15 deletions

```diff
@@ -43,15 +43,15 @@ def create(
         quantize: Optional[Quantization] = None,
         checkpoint_path: Optional[str] = None,
         # General endpoint fields
-        cpus: int = 8,
-        memory: str = "24Gi",
-        storage: str = "40Gi",
-        gpus: int = 1,
+        cpus: Optional[int] = None,
+        memory: Optional[str] = None,
+        storage: Optional[str] = None,
+        gpus: Optional[int] = None,
         min_workers: int = 0,
         max_workers: int = 1,
         per_worker: int = 2,
         endpoint_type: ModelEndpointType = ModelEndpointType.STREAMING,
-        gpu_type: Optional[str] = "nvidia-ampere-a10",
+        gpu_type: Optional[str] = None,
         high_priority: Optional[bool] = False,
         post_inference_hooks: Optional[List[PostInferenceHooks]] = None,
         default_callback_url: Optional[str] = None,
@@ -91,21 +91,23 @@ def create(
                 Can be either a folder or a tar file. Folder is preferred since we don't need to untar and model loads faster.
                 For model weights, safetensors are preferred but PyTorch checkpoints are also accepted (model loading will be longer).
 
-            cpus (`int`):
+            cpus (`Optional[int]`):
                 Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater
-                than or equal to 1. Recommendation is set it to 8 * GPU count.
+                than or equal to 1. Recommendation is set it to 8 * GPU count. Can be inferred from the model size.
 
-            memory (`str`):
+            memory (`Optional[str]`):
                 Amount of memory each worker should get, e.g. "4Gi", "512Mi", etc. This must
                 be a positive amount of memory. Recommendation is set it to 24Gi * GPU count.
+                Can be inferred from the model size.
 
-            storage (`str`):
+            storage (`Optional[str]`):
                 Amount of local ephemeral storage each worker should get, e.g. "4Gi",
                 "512Mi", etc. This must be a positive amount of storage.
                 Recommendataion is 40Gi for 7B models, 80Gi for 13B models and 200Gi for 70B models.
+                Can be inferred from the model size.
 
-            gpus (`int`):
-                Number of gpus each worker should get, e.g. 0, 1, etc.
+            gpus (`Optional[int]`):
+                Number of gpus each worker should get, e.g. 0, 1, etc. Can be inferred from the model size.
 
             min_workers (`int`):
                 The minimum number of workers. Must be greater than or equal to 0. This
@@ -142,15 +144,15 @@ def create(
 
             gpu_type (`Optional[str]`):
                 If specifying a non-zero number of gpus, this controls the type of gpu
-                requested. Here are the supported values:
+                requested. Can be inferred from the model size. Here are the supported values:
 
                 - ``nvidia-tesla-t4``
                 - ``nvidia-ampere-a10``
                 - ``nvidia-ampere-a100``
                 - ``nvidia-ampere-a100e``
                 - ``nvidia-hopper-h100``
-                - ``nvidia-hopper-h100-1g20gb``
-                - ``nvidia-hopper-h100-3g40gb``
+                - ``nvidia-hopper-h100-1g20gb``  # 1 slice of MIG with 1g compute and 20GB memory
+                - ``nvidia-hopper-h100-3g40gb``  # 1 slice of MIG with 3g compute and 40GB memory
 
             high_priority (`Optional[bool]`):
                 Either ``True`` or ``False``. Enabling this will allow the created
@@ -173,7 +175,27 @@ def create(
         Returns:
             CreateLLMEndpointResponse: creation task ID of the created Model. Currently not used.
 
-        === "Create Llama 2 7B model in Python"
+        === "Create Llama 2 70B model with hardware specs inferred in Python"
+            ```python
+            from llmengine import Model
+
+            response = Model.create(
+                name="llama-2-70b-test"
+                model="llama-2-70b",
+                inference_framework_image_tag="0.9.4",
+                inference_framework=LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
+                num_shards=4,
+                checkpoint_path="s3://path/to/checkpoint",
+                min_workers=0,
+                max_workers=1,
+                per_worker=10,
+                endpoint_type=ModelEndpointType.STREAMING,
+                public_inference=False,
+            )
+
+            print(response.json())
+            ```
+        === "Create Llama 2 7B model with hardware specs specified in Python"
             ```python
             from llmengine import Model
 
```
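
The added docstring example shows the new behavior end to end: none of `cpus`, `gpus`, `memory`, `storage`, or `gpu_type` is passed, so the server picks them based on the model size. As written in the diff it is missing a comma after the `name` argument; a corrected, standalone version is sketched below. The enum import path is an assumption (the docstring example only imports `Model`), and a configured API key is required for the call to actually succeed.

```python
# Standalone version of the docstring example added in this commit, with the
# missing comma after name= fixed. The import path for the enums is assumed.
from llmengine import Model
from llmengine.data_types import LLMInferenceFramework, ModelEndpointType

response = Model.create(
    name="llama-2-70b-test",
    model="llama-2-70b",
    inference_framework_image_tag="0.9.4",
    inference_framework=LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
    num_shards=4,
    checkpoint_path="s3://path/to/checkpoint",
    # cpus, gpus, memory, storage, and gpu_type are intentionally omitted:
    # with this commit they default to None and are inferred from the model size.
    min_workers=0,
    max_workers=1,
    per_worker=10,
    endpoint_type=ModelEndpointType.STREAMING,
    public_inference=False,
)

print(response.json())
```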
