
Commit 110833b

Allow H100 to be used (#522)
* Allow H100 to be used
* Add MIG groups
1 parent a36f7a2 commit 110833b

File tree

7 files changed (+27 / -12 lines)


clients/python/llmengine/data_types.py

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ class GpuType(str, Enum):
     NVIDIA_AMPERE_A10 = "nvidia-ampere-a10"
     NVIDIA_AMPERE_A100 = "nvidia-ampere-a100"
     NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"
+    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"
+    NVIDIA_HOPPER_H100_1G_20GB = "nvidia-hopper-h100-1g20gb"
+    NVIDIA_HOPPER_H100_3G_40GB = "nvidia-hopper-h100-3g40gb"
 
 
 class ModelEndpointType(str, Enum):
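
Since GpuType subclasses str, the new members compare equal to the literal strings clients already send over the wire. A quick illustration (standard Python enum behavior, not code from this commit):

    from llmengine.data_types import GpuType

    # str-valued members compare equal to their serialized form...
    assert GpuType.NVIDIA_HOPPER_H100 == "nvidia-hopper-h100"
    # ...and the wire string looks up the corresponding member.
    assert GpuType("nvidia-hopper-h100-1g20gb") is GpuType.NVIDIA_HOPPER_H100_1G_20GB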

clients/python/llmengine/model.py

Lines changed: 2 additions & 0 deletions
@@ -148,6 +148,7 @@ def create(
             - ``nvidia-ampere-a10``
             - ``nvidia-ampere-a100``
             - ``nvidia-ampere-a100e``
+            - ``nvidia-hopper-h100``
 
         high_priority (`Optional[bool]`):
             Either ``True`` or ``False``. Enabling this will allow the created
@@ -531,6 +532,7 @@ def update(
             - ``nvidia-ampere-a10``
             - ``nvidia-ampere-a100``
             - ``nvidia-ampere-a100e``
+            - ``nvidia-hopper-h100``
 
         high_priority (`Optional[bool]`):
             Either ``True`` or ``False``. Enabling this will allow the created
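
With the docstring updated, an endpoint can request the new type through the client. A hedged sketch: only gpu_type is touched by this commit; the other keyword arguments are illustrative, and several required ones (model source, inference framework, worker settings) are omitted for brevity:

    from llmengine import Model

    response = Model.create(
        name="llama-2-70b-h100",        # illustrative endpoint name
        model="llama-2-70b",            # illustrative base model
        gpus=2,
        gpu_type="nvidia-hopper-h100",  # value enabled by this change
        memory="160Gi",
        storage="160Gi",
    )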

docs/guides/self_hosting.md

Lines changed: 3 additions & 2 deletions
@@ -21,8 +21,9 @@ Additionally, they must have the `k8s.amazonaws.com/accelerator` label set appropriately
 | --- | --- |
 | g4dn | nvidia-tesla-t4 |
 | g5 | nvidia-tesla-a10 |
-| p4d | nvidia-tesla-a100 |
-| p4de | nvidia-tesla-a100e |
+| p4d | nvidia-ampere-a100 |
+| p4de | nvidia-ampere-a100e |
+| p5 | nvidia-hopper-h100 |
 
 We also recommend setting the following taint on your GPU nodes to prevent pods requiring GPU resources from being scheduled on them:
 - { key = "nvidia.com/gpu", value = "true", effect = "NO_SCHEDULE" }
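
For nodes that are not provisioned through a managed node group, the new table row translates into roughly these commands (standard kubectl label/taint syntax; the node name is a placeholder):

    kubectl label nodes <p5-node-name> k8s.amazonaws.com/accelerator=nvidia-hopper-h100
    kubectl taint nodes <p5-node-name> nvidia.com/gpu=true:NoSchedule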

model-engine/model_engine_server/common/resource_limits.py

Lines changed: 6 additions & 0 deletions
@@ -34,13 +34,19 @@
 )  # Should we allow multi-gpu instances? This allows the largest single-gpu g5dn instance.
 # p4d.24xlarge, p4de.24xlarge
 A100_INSTANCE_LIMITS = dict(cpus=95, memory="1000Gi")
+H100_INSTANCE_LIMITS = dict(cpus=191, memory="2000Gi")
+H100_1G_20GB_INSTANCE_LIMITS = dict(cpus=47, memory="500Gi")
+H100_3G_40GB_INSTANCE_LIMITS = dict(cpus=95, memory="1000Gi")
 STORAGE_LIMIT = "500G"  # TODO: figure out an actual limit.
 REQUESTS_BY_GPU_TYPE = {
     None: CPU_INSTANCE_LIMITS,
     GpuType.NVIDIA_TESLA_T4: T4_INSTANCE_LIMITS,
     GpuType.NVIDIA_AMPERE_A10: A10_INSTANCE_LIMITS,
     GpuType.NVIDIA_AMPERE_A100: A100_INSTANCE_LIMITS,
     GpuType.NVIDIA_AMPERE_A100E: A100_INSTANCE_LIMITS,
+    GpuType.NVIDIA_HOPPER_H100: H100_INSTANCE_LIMITS,
+    GpuType.NVIDIA_HOPPER_H100_1G_20GB: H100_1G_20GB_INSTANCE_LIMITS,
+    GpuType.NVIDIA_HOPPER_H100_3G_40GB: H100_3G_40GB_INSTANCE_LIMITS,
 }
 
 FORWARDER_CPU_USAGE = 1
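
The new ceilings mirror what a single H100 node can provide (p5.48xlarge exposes 192 vCPUs and 2 TiB of RAM across 8 GPUs), with the MIG variants getting proportionally smaller shares. A small lookup illustration, not code from this commit:

    from model_engine_server.common.resource_limits import REQUESTS_BY_GPU_TYPE
    from model_engine_server.domain.entities.gpu_type import GpuType

    # Per-endpoint CPU/memory requests are checked against the node-level ceiling for the GPU type.
    assert REQUESTS_BY_GPU_TYPE[GpuType.NVIDIA_HOPPER_H100] == {"cpus": 191, "memory": "2000Gi"}
    assert REQUESTS_BY_GPU_TYPE[GpuType.NVIDIA_HOPPER_H100_1G_20GB] == {"cpus": 47, "memory": "500Gi"}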

model-engine/model_engine_server/domain/entities/gpu_type.py

Lines changed: 3 additions & 0 deletions
@@ -8,3 +8,6 @@ class GpuType(str, Enum):
     NVIDIA_AMPERE_A10 = "nvidia-ampere-a10"
     NVIDIA_AMPERE_A100 = "nvidia-ampere-a100"
     NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"
+    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"
+    NVIDIA_HOPPER_H100_1G_20GB = "nvidia-hopper-h100-1g20gb"
+    NVIDIA_HOPPER_H100_3G_40GB = "nvidia-hopper-h100-3g40gb"
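
The two MIG members follow NVIDIA's profile naming for the 80 GB H100: 1g20gb is one compute slice with 20 GB of memory, 3g40gb is three slices with 40 GB. A hypothetical sketch of how such values could map onto device-plugin resource names under the NVIDIA plugin's "mixed" MIG strategy; this mapping is not part of the commit:

    from model_engine_server.domain.entities.gpu_type import GpuType

    # Hypothetical helper: full GPUs request the plain resource, MIG slices the
    # profile-specific resource exposed by the NVIDIA device plugin.
    MIG_RESOURCE_NAMES = {
        GpuType.NVIDIA_HOPPER_H100_1G_20GB: "nvidia.com/mig-1g.20gb",
        GpuType.NVIDIA_HOPPER_H100_3G_40GB: "nvidia.com/mig-3g.40gb",
    }

    def k8s_gpu_resource_name(gpu_type: GpuType) -> str:
        return MIG_RESOURCE_NAMES.get(gpu_type, "nvidia.com/gpu")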

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 4 additions & 4 deletions
@@ -2286,25 +2286,25 @@ def _infer_hardware(
         gpus = 2
         memory = "160Gi"
         storage = "160Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     elif min_memory_gb <= 320:
         cpus = "40"
         gpus = 4
         memory = "320Gi"
         storage = "320Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     elif min_memory_gb <= 640:
         cpus = "80"
         gpus = 8
         memory = "800Gi"
         storage = "460Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     elif "llama-3-8b-instruct-262k" in model_name:
         cpus = "20"
         gpus = 2
         memory = "40Gi"
         storage = "40Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     else:
         raise ObjectHasInvalidValueException(f"Unable to infer hardware for {model_name}.")
 
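
Both the A100e and the H100 carry 80 GB of GPU memory, so the per-tier GPU counts are unchanged; only the inferred GPU type moves to Hopper. A concrete reading of the hunk (field names mirror the assignments above; the surrounding request object is assumed):

    # A model needing ~300 GiB of GPU memory falls in the `min_memory_gb <= 320` branch,
    # so _infer_hardware now resolves to:
    expected = dict(
        cpus="40",
        gpus=4,                          # 4 x 80 GB H100
        memory="320Gi",
        storage="320Gi",
        gpu_type="nvidia-hopper-h100",   # previously nvidia-ampere-a100e
    )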

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 6 additions & 6 deletions
@@ -1848,7 +1848,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 2
     assert hardware.memory == "160Gi"
     assert hardware.storage == "160Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["MixtralForCausalLM"],
@@ -1879,7 +1879,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 8
     assert hardware.memory == "800Gi"
     assert hardware.storage == "460Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-7b-hf",
@@ -2015,7 +2015,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 2
     assert hardware.memory == "160Gi"
     assert hardware.storage == "160Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
@@ -2043,7 +2043,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 2
     assert hardware.memory == "160Gi"
     assert hardware.storage == "160Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     # (TODO) figure out how to calculate memory for llama-3-8b-instruct-262k
     # fake_llm_artifact_gateway.model_config = {
@@ -2073,7 +2073,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     # assert hardware.gpus == 2
     # assert hardware.memory == "160Gi"
     # assert hardware.storage == "160Gi"
-    # assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    # assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     with pytest.raises(ObjectHasInvalidValueException):
         _infer_hardware(fake_llm_artifact_gateway, "unsupported_model", "")
@@ -2095,7 +2095,7 @@ def test_fill_hardware_info(fake_llm_artifact_gateway):
     assert request.gpus == 2
     assert request.memory == "160Gi"
     assert request.storage == "160Gi"
-    assert request.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert request.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     request = CreateLLMModelEndpointV1Request(
         name="mixtral-8x7b",
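
To exercise just the updated assertions locally, a pytest selection along these lines should work (the path and -k filter are assumptions about the checkout layout):

    pytest model-engine/tests/unit/domain/test_llm_use_cases.py -k "infer_hardware or fill_hardware_info"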

0 commit comments
