Commit 70520e6

Ianmacleod/add mistral (#307)
* add mistral 7b instruct
* adding mistral support
* update docs
* update docs again
* add mistral 7b max model len and max num batched tokens
1 parent 7667f3b commit 70520e6

3 files changed (+38 −21 lines)

docs/model_zoo.md

Lines changed: 18 additions & 16 deletions

@@ -2,22 +2,24 @@
 
 Scale hosts the following models in the LLM Engine Model Zoo:
 
-| Model Name            | Inference APIs Available | Fine-tuning APIs Available |
-| --------------------- | ------------------------ | -------------------------- |
-| `llama-7b`            | ✅                       | ✅                         |
-| `llama-2-7b`          | ✅                       | ✅                         |
-| `llama-2-7b-chat`     | ✅                       |                            |
-| `llama-2-13b`         | ✅                       |                            |
-| `llama-2-13b-chat`    | ✅                       |                            |
-| `llama-2-70b`         | ✅                       | ✅                         |
-| `llama-2-70b-chat`    | ✅                       |                            |
-| `falcon-7b`           | ✅                       |                            |
-| `falcon-7b-instruct`  | ✅                       |                            |
-| `falcon-40b`          | ✅                       |                            |
-| `falcon-40b-instruct` | ✅                       |                            |
-| `mpt-7b`              | ✅                       |                            |
-| `mpt-7b-instruct`     | ✅                       | ✅                         |
-| `flan-t5-xxl`         | ✅                       |                            |
+| Model Name            | Inference APIs Available | Fine-tuning APIs Available | Inference Frameworks Available |
+| --------------------- | ------------------------ | -------------------------- | ------------------------------ |
+| `llama-7b`            | ✅                       | ✅                         | deepspeed, text-generation-inference |
+| `llama-2-7b`          | ✅                       | ✅                         | text-generation-inference, vllm |
+| `llama-2-7b-chat`     | ✅                       |                            | text-generation-inference, vllm |
+| `llama-2-13b`         | ✅                       |                            | text-generation-inference, vllm |
+| `llama-2-13b-chat`    | ✅                       |                            | text-generation-inference, vllm |
+| `llama-2-70b`         | ✅                       | ✅                         | text-generation-inference, vllm |
+| `llama-2-70b-chat`    | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-7b`           | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-7b-instruct`  | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-40b`          | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-40b-instruct` | ✅                       |                            | text-generation-inference, vllm |
+| `mpt-7b`              | ✅                       |                            | deepspeed, text-generation-inference, vllm |
+| `mpt-7b-instruct`     | ✅                       | ✅                         | deepspeed, text-generation-inference, vllm |
+| `flan-t5-xxl`         | ✅                       |                            | deepspeed, text-generation-inference |
+| `mistral-7b`          | ✅                       |                            | vllm |
+| `mistral-7b-instruct` | ✅                       |                            | vllm |
 
 ## Usage

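With the two new rows in the table, `mistral-7b` and `mistral-7b-instruct` can be called like any other Model Zoo model. A minimal sketch, assuming the `llmengine` Python client is installed and an API key is configured; the prompt text and sampling parameters are illustrative:

```python
from llmengine import Completion

# Query the newly added Mistral instruct model (served through vLLM per the table above).
response = Completion.create(
    model="mistral-7b-instruct",
    prompt="[INST] What is low-latency LLM serving? [/INST]",
    max_new_tokens=100,
    temperature=0.2,
)
print(response.output.text)
```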
model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 19 additions & 4 deletions
@@ -117,6 +117,8 @@
         "falcon-7b-instruct": "tiiuae/falcon-7b-instruct",
         "falcon-40b": "tiiuae/falcon-40b",
         "falcon-40b-instruct": "tiiuae/falcon-40b-instruct",
+        "mistral-7b": "mistralai/Mistral-7B-v0.1",
+        "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
     },
     LLMInferenceFramework.LIGHTLLM: {
         "llama-7b": "decapoda-research/llama-7b-hf",
@@ -488,13 +490,21 @@ async def create_vllm_bundle(
         command = []
 
         max_num_batched_tokens = 2560  # vLLM's default
+        max_model_len = None
         if "llama-2" in model_name:
             max_num_batched_tokens = 4096  # Need to be bigger than model's context window
+        if "mistral" in model_name:
+            max_num_batched_tokens = 8000
+            max_model_len = 8000
 
         subcommands = []
         if checkpoint_path is not None:
             if checkpoint_path.startswith("s3://"):
-                final_weights_folder = "model_files"
+                # Workaround: transformers doesn't support Mistral yet, and vLLM expects "mistral" in the model weights folder name
+                if "mistral" in model_name:
+                    final_weights_folder = "mistral_files"
+                else:
+                    final_weights_folder = "model_files"
                 subcommands += self.load_model_weights_sub_commands(
                     LLMInferenceFramework.VLLM,
                     framework_image_tag,
@@ -508,9 +518,14 @@ async def create_vllm_bundle(
         else:
             final_weights_folder = _SUPPORTED_MODEL_NAMES[LLMInferenceFramework.VLLM][model_name]
 
-        subcommands.append(
-            f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens}"
-        )
+        if max_model_len:
+            subcommands.append(
+                f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens} --max-model-len {max_model_len}"
+            )
+        else:
+            subcommands.append(
+                f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens}"
+            )
 
         if quantize:
             if quantize == Quantization.AWQ:
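The net effect of these two hunks: Mistral deployments get a larger batched-token budget, an explicit `--max-model-len`, and weights staged in a folder whose name contains "mistral"; all other models keep the previous behavior. A standalone sketch of the resulting launch command, mirroring the logic above (the helper name and `num_shards` value are illustrative, not part of the change):

```python
def build_vllm_command(model_name: str, final_weights_folder: str, num_shards: int) -> str:
    """Illustrative reconstruction of the vllm_server launch command assembled above."""
    max_num_batched_tokens = 2560  # vLLM's default
    max_model_len = None
    if "llama-2" in model_name:
        max_num_batched_tokens = 4096  # needs to exceed the model's context window
    if "mistral" in model_name:
        max_num_batched_tokens = 8000
        max_model_len = 8000

    command = (
        f"python -m vllm_server --model {final_weights_folder} "
        f"--tensor-parallel-size {num_shards} --port 5005 "
        f"--max-num-batched-tokens {max_num_batched_tokens}"
    )
    if max_model_len:
        command += f" --max-model-len {max_model_len}"
    return command


print(build_vllm_command("mistral-7b-instruct", "mistral_files", 1))
# python -m vllm_server --model mistral_files --tensor-parallel-size 1 --port 5005
#   --max-num-batched-tokens 8000 --max-model-len 8000
```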
Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 ray==2.6.3
-git+https://github.com/vllm-project/vllm.git@7d7e3b78a3c265ab3c57eeff43af56f509907998#egg=vllm
+vllm==0.2.0
 pydantic==1.10.12
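Since the image now installs a released wheel instead of a pinned git commit, one quick sanity check is to read the installed versions back at runtime (purely illustrative):

```python
from importlib.metadata import version

# The updated requirements pin vllm to 0.2.0 alongside ray 2.6.3.
print(version("vllm"))  # expected: 0.2.0
print(version("ray"))   # expected: 2.6.3
```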
