
Commit 110833b

Allow H100 to be used (#522)
* Allow H100 to be used
* Add MIG groups
1 parent a36f7a2 commit 110833b

File tree

7 files changed (+27 / -12 lines)


clients/python/llmengine/data_types.py

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ class GpuType(str, Enum):
     NVIDIA_AMPERE_A10 = "nvidia-ampere-a10"
     NVIDIA_AMPERE_A100 = "nvidia-ampere-a100"
     NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"
+    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"
+    NVIDIA_HOPPER_H100_1G_20GB = "nvidia-hopper-h100-1g20gb"
+    NVIDIA_HOPPER_H100_3G_40GB = "nvidia-hopper-h100-3g40gb"
 
 
 class ModelEndpointType(str, Enum):
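
Since GpuType subclasses str, the new members compare equal to the literal strings clients already send over the wire. A quick illustration (standard Python enum behavior, not code from this commit):

    from llmengine.data_types import GpuType

    # str-valued members compare equal to their serialized form...
    assert GpuType.NVIDIA_HOPPER_H100 == "nvidia-hopper-h100"
    # ...and the wire string looks up the corresponding member.
    assert GpuType("nvidia-hopper-h100-1g20gb") is GpuType.NVIDIA_HOPPER_H100_1G_20GB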

clients/python/llmengine/model.py

Lines changed: 2 additions & 0 deletions
@@ -148,6 +148,7 @@ def create(
             - ``nvidia-ampere-a10``
             - ``nvidia-ampere-a100``
             - ``nvidia-ampere-a100e``
+            - ``nvidia-hopper-h100``
 
         high_priority (`Optional[bool]`):
             Either ``True`` or ``False``. Enabling this will allow the created
@@ -531,6 +532,7 @@ def update(
             - ``nvidia-ampere-a10``
             - ``nvidia-ampere-a100``
             - ``nvidia-ampere-a100e``
+            - ``nvidia-hopper-h100``
 
         high_priority (`Optional[bool]`):
             Either ``True`` or ``False``. Enabling this will allow the created
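
With the docstring updated, an endpoint can request the new type through the client. A hedged sketch: only gpu_type is touched by this commit; the other keyword arguments are illustrative, and several required ones (model source, inference framework, worker settings) are omitted for brevity:

    from llmengine import Model

    response = Model.create(
        name="llama-2-70b-h100",        # illustrative endpoint name
        model="llama-2-70b",            # illustrative base model
        gpus=2,
        gpu_type="nvidia-hopper-h100",  # value enabled by this change
        memory="160Gi",
        storage="160Gi",
    )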

docs/guides/self_hosting.md

Lines changed: 3 additions & 2 deletions
@@ -21,8 +21,9 @@ Additionally, they must have the `k8s.amazonaws.com/accelerator` label set appropriately
 | --- | --- |
 | g4dn | nvidia-tesla-t4 |
 | g5 | nvidia-tesla-a10 |
-| p4d | nvidia-tesla-a100 |
-| p4de | nvidia-tesla-a100e |
+| p4d | nvidia-ampere-a100 |
+| p4de | nvidia-ampere-a100e |
+| p5 | nvidia-hopper-h100 |
 
 We also recommend setting the following taint on your GPU nodes to prevent pods requiring GPU resources from being scheduled on them:
 - { key = "nvidia.com/gpu", value = "true", effect = "NO_SCHEDULE" }
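
For nodes that are not provisioned through a managed node group, the new table row translates into roughly these commands (standard kubectl label/taint syntax; the node name is a placeholder):

    kubectl label nodes <p5-node-name> k8s.amazonaws.com/accelerator=nvidia-hopper-h100
    kubectl taint nodes <p5-node-name> nvidia.com/gpu=true:NoSchedule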

model-engine/model_engine_server/common/resource_limits.py

Lines changed: 6 additions & 0 deletions
@@ -34,13 +34,19 @@
 )  # Should we allow multi-gpu instances? This allows the largest single-gpu g5dn instance.
 # p4d.24xlarge, p4de.24xlarge
 A100_INSTANCE_LIMITS = dict(cpus=95, memory="1000Gi")
+H100_INSTANCE_LIMITS = dict(cpus=191, memory="2000Gi")
+H100_1G_20GB_INSTANCE_LIMITS = dict(cpus=47, memory="500Gi")
+H100_3G_40GB_INSTANCE_LIMITS = dict(cpus=95, memory="1000Gi")
 STORAGE_LIMIT = "500G"  # TODO: figure out an actual limit.
 REQUESTS_BY_GPU_TYPE = {
     None: CPU_INSTANCE_LIMITS,
     GpuType.NVIDIA_TESLA_T4: T4_INSTANCE_LIMITS,
     GpuType.NVIDIA_AMPERE_A10: A10_INSTANCE_LIMITS,
     GpuType.NVIDIA_AMPERE_A100: A100_INSTANCE_LIMITS,
     GpuType.NVIDIA_AMPERE_A100E: A100_INSTANCE_LIMITS,
+    GpuType.NVIDIA_HOPPER_H100: H100_INSTANCE_LIMITS,
+    GpuType.NVIDIA_HOPPER_H100_1G_20GB: H100_1G_20GB_INSTANCE_LIMITS,
+    GpuType.NVIDIA_HOPPER_H100_3G_40GB: H100_3G_40GB_INSTANCE_LIMITS,
 }
 
 FORWARDER_CPU_USAGE = 1
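
The new ceilings mirror what a single H100 node can provide (p5.48xlarge exposes 192 vCPUs and 2 TiB of RAM across 8 GPUs), with the MIG variants getting proportionally smaller shares. A small lookup illustration, not code from this commit:

    from model_engine_server.common.resource_limits import REQUESTS_BY_GPU_TYPE
    from model_engine_server.domain.entities.gpu_type import GpuType

    # Per-endpoint CPU/memory requests are checked against the node-level ceiling for the GPU type.
    assert REQUESTS_BY_GPU_TYPE[GpuType.NVIDIA_HOPPER_H100] == {"cpus": 191, "memory": "2000Gi"}
    assert REQUESTS_BY_GPU_TYPE[GpuType.NVIDIA_HOPPER_H100_1G_20GB] == {"cpus": 47, "memory": "500Gi"}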

model-engine/model_engine_server/domain/entities/gpu_type.py

Lines changed: 3 additions & 0 deletions
@@ -8,3 +8,6 @@ class GpuType(str, Enum):
     NVIDIA_AMPERE_A10 = "nvidia-ampere-a10"
     NVIDIA_AMPERE_A100 = "nvidia-ampere-a100"
     NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"
+    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"
+    NVIDIA_HOPPER_H100_1G_20GB = "nvidia-hopper-h100-1g20gb"
+    NVIDIA_HOPPER_H100_3G_40GB = "nvidia-hopper-h100-3g40gb"
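
The two MIG members follow NVIDIA's profile naming for the 80 GB H100: 1g20gb is one compute slice with 20 GB of memory, 3g40gb is three slices with 40 GB. A hypothetical sketch of how such values could map onto device-plugin resource names under the NVIDIA plugin's "mixed" MIG strategy; this mapping is not part of the commit:

    from model_engine_server.domain.entities.gpu_type import GpuType

    # Hypothetical helper: full GPUs request the plain resource, MIG slices the
    # profile-specific resource exposed by the NVIDIA device plugin.
    MIG_RESOURCE_NAMES = {
        GpuType.NVIDIA_HOPPER_H100_1G_20GB: "nvidia.com/mig-1g.20gb",
        GpuType.NVIDIA_HOPPER_H100_3G_40GB: "nvidia.com/mig-3g.40gb",
    }

    def k8s_gpu_resource_name(gpu_type: GpuType) -> str:
        return MIG_RESOURCE_NAMES.get(gpu_type, "nvidia.com/gpu")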

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 4 additions & 4 deletions
@@ -2286,25 +2286,25 @@ def _infer_hardware(
         gpus = 2
         memory = "160Gi"
         storage = "160Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     elif min_memory_gb <= 320:
         cpus = "40"
         gpus = 4
         memory = "320Gi"
         storage = "320Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     elif min_memory_gb <= 640:
         cpus = "80"
         gpus = 8
         memory = "800Gi"
         storage = "460Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     elif "llama-3-8b-instruct-262k" in model_name:
         cpus = "20"
         gpus = 2
         memory = "40Gi"
         storage = "40Gi"
-        gpu_type = GpuType.NVIDIA_AMPERE_A100E
+        gpu_type = GpuType.NVIDIA_HOPPER_H100
     else:
         raise ObjectHasInvalidValueException(f"Unable to infer hardware for {model_name}.")
 
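
Both the A100e and the H100 carry 80 GB of GPU memory, so the per-tier GPU counts are unchanged; only the inferred GPU type moves to Hopper. A concrete reading of the hunk (field names mirror the assignments above; the surrounding request object is assumed):

    # A model needing ~300 GiB of GPU memory falls in the `min_memory_gb <= 320` branch,
    # so _infer_hardware now resolves to:
    expected = dict(
        cpus="40",
        gpus=4,                          # 4 x 80 GB H100
        memory="320Gi",
        storage="320Gi",
        gpu_type="nvidia-hopper-h100",   # previously nvidia-ampere-a100e
    )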

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 6 additions & 6 deletions
@@ -1848,7 +1848,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 2
     assert hardware.memory == "160Gi"
     assert hardware.storage == "160Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["MixtralForCausalLM"],
@@ -1879,7 +1879,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 8
     assert hardware.memory == "800Gi"
     assert hardware.storage == "460Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-7b-hf",
@@ -2015,7 +2015,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 2
     assert hardware.memory == "160Gi"
     assert hardware.storage == "160Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
@@ -2043,7 +2043,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.gpus == 2
     assert hardware.memory == "160Gi"
     assert hardware.storage == "160Gi"
-    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     # (TODO) figure out how to calculate memory for llama-3-8b-instruct-262k
     # fake_llm_artifact_gateway.model_config = {
@@ -2073,7 +2073,7 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     # assert hardware.gpus == 2
     # assert hardware.memory == "160Gi"
     # assert hardware.storage == "160Gi"
-    # assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    # assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     with pytest.raises(ObjectHasInvalidValueException):
         _infer_hardware(fake_llm_artifact_gateway, "unsupported_model", "")
@@ -2095,7 +2095,7 @@ def test_fill_hardware_info(fake_llm_artifact_gateway):
     assert request.gpus == 2
     assert request.memory == "160Gi"
     assert request.storage == "160Gi"
-    assert request.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+    assert request.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
     request = CreateLLMModelEndpointV1Request(
         name="mixtral-8x7b",
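
To exercise just the updated assertions locally, a pytest selection along these lines should work (the path and -k filter are assumptions about the checkout layout):

    pytest model-engine/tests/unit/domain/test_llm_use_cases.py -k "infer_hardware or fill_hardware_info"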

0 commit comments
