
Commit a995869

Properly add mixtral 8x22b (#493)
* Properly add mixtral 8x22b
* unit test
1 parent d6088b4 commit a995869

4 files changed, +18 -1 lines changed

docs/model_zoo.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -22,6 +22,7 @@ Scale hosts the following models in the LLM Engine Model Zoo:
 | `mistral-7b-instruct` ||| vllm | 8000 |
 | `mixtral-8x7b` || | vllm | 32768 |
 | `mixtral-8x7b-instruct` || | vllm | 32768 |
+| `mixtral-8x22b` || | vllm | 65536 |
 | `codellama-7b` ||| text-generation-inference, vllm | 16384 |
 | `codellama-7b-instruct` ||| text-generation-inference, vllm | 16384 |
 | `codellama-13b` ||| text-generation-inference, vllm | 16384 |
```
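For reference, a model listed in the zoo is reachable through the llmengine Python client. A minimal sketch, assuming the hosted endpoint is named exactly `mixtral-8x22b` as in the table row above and that `SCALE_API_KEY` is set in the environment:

```python
# Minimal sketch: calling the newly listed model through the llmengine client.
# The endpoint name is assumed to match the model zoo entry above.
from llmengine import Completion

response = Completion.create(
    model="mixtral-8x22b",
    prompt="Summarize mixture-of-experts routing in one sentence.",
    max_new_tokens=128,  # prompt + output must stay within the 65536-token limit above
    temperature=0.2,
)
print(response.output.text)
```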

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 9 additions & 1 deletion

```diff
@@ -180,6 +180,7 @@
     "mistral-7b-instruct",
     "mixtral-8x7b",
     "mixtral-8x7b-instruct",
+    "mixtral-8x22b",
     "mammoth-coder-llama-2-7b",
     "mammoth-coder-llama-2-13b",
     "mammoth-coder-llama-2-34b",
```
```diff
@@ -230,7 +231,8 @@
     "gemma": {"max_model_len": 8192, "max_num_batched_tokens": 8192},
     "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
     "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
-    "mixtral": {"max_model_len": 32768, "max_num_batched_tokens": 32768},
+    "mixtral-8x7b": {"max_model_len": 32768, "max_num_batched_tokens": 32768},
+    "mixtral-8x22b": {"max_model_len": 65536, "max_num_batched_tokens": 65536},
     "zephyr": {"max_model_len": 32768, "max_num_batched_tokens": 32768},
 }
```
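Splitting the single `mixtral` override into `mixtral-8x7b` and `mixtral-8x22b` lets the two families advertise different context windows. A minimal sketch of how such per-family overrides could be resolved from a model name; the helper and dict name below are hypothetical, not the repo's actual lookup code:

```python
# Hypothetical helper (not taken from the repo): resolve per-family overrides
# by longest-substring match, so "mixtral-8x22b" is not shadowed by a shorter
# family key and "mixtral-8x7b-instruct" still finds the 8x7b entry.
from typing import Dict, Optional

VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, Optional[int]]] = {
    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
    "mixtral-8x7b": {"max_model_len": 32768, "max_num_batched_tokens": 32768},
    "mixtral-8x22b": {"max_model_len": 65536, "max_num_batched_tokens": 65536},
}

def resolve_overrides(model_name: str) -> Optional[Dict[str, Optional[int]]]:
    # Collect every family key that appears in the model name, then prefer
    # the most specific (longest) one.
    matches = [key for key in VLLM_MODEL_LENGTH_OVERRIDES if key in model_name]
    return VLLM_MODEL_LENGTH_OVERRIDES[max(matches, key=len)] if matches else None

assert resolve_overrides("mixtral-8x22b")["max_model_len"] == 65536
assert resolve_overrides("mixtral-8x7b-instruct")["max_model_len"] == 32768
```

The next hunk in the same file extends `infer_hardware_from_model_name` with the new family's hardware profile.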

```diff
@@ -2200,6 +2202,12 @@ def infer_hardware_from_model_name(model_name: str) -> CreateDockerImageBatchJob
         memory = "160Gi"
         storage = "160Gi"
         gpu_type = GpuType.NVIDIA_AMPERE_A100E
+    elif "mixtral-8x22b" in model_name:
+        cpus = "80"
+        gpus = 8
+        memory = "800Gi"
+        storage = "460Gi"
+        gpu_type = GpuType.NVIDIA_AMPERE_A100E
     else:
         numbers = re.findall(r"\d+", model_name)
         if len(numbers) == 0:
```
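The sizing is plausible at a back-of-envelope level. Assuming Mixtral 8x22B's roughly 141B total parameters (Mistral's published figure, not something stated in this diff) stored in fp16:

```python
# Back-of-envelope check for the 8-GPU request above. Assumption not taken
# from this diff: ~141e9 total parameters at 2 bytes each (fp16).
params = 141e9
weights_gib = params * 2 / 2**30
print(f"fp16 weights: ~{weights_gib:.0f} GiB")                # ~263 GiB
print(f"per GPU across 8 cards: ~{weights_gib / 8:.0f} GiB")  # ~33 GiB each
```

That leaves most of each 80 GB A100e free for KV cache and activations, consistent with the 65536-token context configured above.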

model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -62,6 +62,7 @@ def get_default_supported_models_info() -> Dict[str, ModelInfo]:
     "mistral-7b-instruct": ModelInfo("mistralai/Mistral-7B-Instruct-v0.1", None),
     "mixtral-8x7b": ModelInfo("mistralai/Mixtral-8x7B-v0.1", None),
     "mixtral-8x7b-instruct": ModelInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", None),
+    "mixtral-8x22b": ModelInfo("mistral-community/Mixtral-8x22B-v0.1", None),
     "mammoth-coder-llama-2-7b": ModelInfo("TIGER-Lab/MAmmoTH-Coder-7B", None),
     "mammoth-coder-llama-2-13b": ModelInfo("TIGER-Lab/MAmmoTH-Coder-13B", None),
     "mammoth-coder-llama-2-34b": ModelInfo("TIGER-Lab/MAmmoTH-Coder-34B", None),
```

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -1705,6 +1705,13 @@ def test_infer_hardware_from_model_name():
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
 
+    hardware = infer_hardware_from_model_name("mixtral-8x22b")
+    assert hardware.cpus == "80"
+    assert hardware.gpus == 8
+    assert hardware.memory == "800Gi"
+    assert hardware.storage == "460Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A100E
+
     hardware = infer_hardware_from_model_name("llama-2-7b")
     assert hardware.cpus == "10"
     assert hardware.gpus == 1
```
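The new assertions mirror the `elif "mixtral-8x22b"` branch added to `infer_hardware_from_model_name`. Something like `pytest model-engine/tests/unit/domain/test_llm_use_cases.py -k test_infer_hardware_from_model_name` should exercise them, though the exact invocation depends on the repo's test setup.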
