Commit 70520e6

Ianmacleod/add mistral (#307)
* add mistral 7b instruct
* adding mistral support
* update docs
* update docs again
* add mistral 7b max model len and max num batched tokens
1 parent 7667f3b commit 70520e6

3 files changed (+38 −21 lines)

docs/model_zoo.md

Lines changed: 18 additions & 16 deletions

@@ -2,22 +2,24 @@
 
 Scale hosts the following models in the LLM Engine Model Zoo:
 
-| Model Name            | Inference APIs Available | Fine-tuning APIs Available |
-| --------------------- | ------------------------ | -------------------------- |
-| `llama-7b`            | ✅                       | ✅                         |
-| `llama-2-7b`          | ✅                       | ✅                         |
-| `llama-2-7b-chat`     | ✅                       |                            |
-| `llama-2-13b`         | ✅                       |                            |
-| `llama-2-13b-chat`    | ✅                       |                            |
-| `llama-2-70b`         | ✅                       | ✅                         |
-| `llama-2-70b-chat`    | ✅                       |                            |
-| `falcon-7b`           | ✅                       |                            |
-| `falcon-7b-instruct`  | ✅                       |                            |
-| `falcon-40b`          | ✅                       |                            |
-| `falcon-40b-instruct` | ✅                       |                            |
-| `mpt-7b`              | ✅                       |                            |
-| `mpt-7b-instruct`     | ✅                       | ✅                         |
-| `flan-t5-xxl`         | ✅                       |                            |
+| Model Name            | Inference APIs Available | Fine-tuning APIs Available | Inference Frameworks Available |
+| --------------------- | ------------------------ | -------------------------- | ------------------------------ |
+| `llama-7b`            | ✅                       | ✅                         | deepspeed, text-generation-inference |
+| `llama-2-7b`          | ✅                       | ✅                         | text-generation-inference, vllm |
+| `llama-2-7b-chat`     | ✅                       |                            | text-generation-inference, vllm |
+| `llama-2-13b`         | ✅                       |                            | text-generation-inference, vllm |
+| `llama-2-13b-chat`    | ✅                       |                            | text-generation-inference, vllm |
+| `llama-2-70b`         | ✅                       | ✅                         | text-generation-inference, vllm |
+| `llama-2-70b-chat`    | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-7b`           | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-7b-instruct`  | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-40b`          | ✅                       |                            | text-generation-inference, vllm |
+| `falcon-40b-instruct` | ✅                       |                            | text-generation-inference, vllm |
+| `mpt-7b`              | ✅                       |                            | deepspeed, text-generation-inference, vllm |
+| `mpt-7b-instruct`     | ✅                       | ✅                         | deepspeed, text-generation-inference, vllm |
+| `flan-t5-xxl`         | ✅                       |                            | deepspeed, text-generation-inference |
+| `mistral-7b`          | ✅                       |                            | vllm |
+| `mistral-7b-instruct` | ✅                       |                            | vllm |
 
 ## Usage

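With the two new rows in the table, `mistral-7b` and `mistral-7b-instruct` can be called like any other Model Zoo model. A minimal sketch, assuming the `llmengine` Python client is installed and an API key is configured; the prompt text and sampling parameters are illustrative:

```python
from llmengine import Completion

# Query the newly added Mistral instruct model (served through vLLM per the table above).
response = Completion.create(
    model="mistral-7b-instruct",
    prompt="[INST] What is low-latency LLM serving? [/INST]",
    max_new_tokens=100,
    temperature=0.2,
)
print(response.output.text)
```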
model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 19 additions & 4 deletions
@@ -117,6 +117,8 @@
         "falcon-7b-instruct": "tiiuae/falcon-7b-instruct",
         "falcon-40b": "tiiuae/falcon-40b",
         "falcon-40b-instruct": "tiiuae/falcon-40b-instruct",
+        "mistral-7b": "mistralai/Mistral-7B-v0.1",
+        "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
     },
     LLMInferenceFramework.LIGHTLLM: {
         "llama-7b": "decapoda-research/llama-7b-hf",
@@ -488,13 +490,21 @@ async def create_vllm_bundle(
         command = []
 
         max_num_batched_tokens = 2560  # vLLM's default
+        max_model_len = None
         if "llama-2" in model_name:
             max_num_batched_tokens = 4096  # Need to be bigger than model's context window
+        if "mistral" in model_name:
+            max_num_batched_tokens = 8000
+            max_model_len = 8000
 
         subcommands = []
         if checkpoint_path is not None:
             if checkpoint_path.startswith("s3://"):
-                final_weights_folder = "model_files"
+                # Workaround: transformers doesn't support Mistral yet, and vLLM expects "mistral" in the model weights folder name
+                if "mistral" in model_name:
+                    final_weights_folder = "mistral_files"
+                else:
+                    final_weights_folder = "model_files"
                 subcommands += self.load_model_weights_sub_commands(
                     LLMInferenceFramework.VLLM,
                     framework_image_tag,
@@ -508,9 +518,14 @@ async def create_vllm_bundle(
         else:
             final_weights_folder = _SUPPORTED_MODEL_NAMES[LLMInferenceFramework.VLLM][model_name]
 
-        subcommands.append(
-            f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens}"
-        )
+        if max_model_len:
+            subcommands.append(
+                f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens} --max-model-len {max_model_len}"
+            )
+        else:
+            subcommands.append(
+                f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens}"
+            )
 
         if quantize:
             if quantize == Quantization.AWQ:
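The net effect of these two hunks: Mistral deployments get a larger batched-token budget, an explicit `--max-model-len`, and weights staged in a folder whose name contains "mistral"; all other models keep the previous behavior. A standalone sketch of the resulting launch command, mirroring the logic above (the helper name and `num_shards` value are illustrative, not part of the change):

```python
def build_vllm_command(model_name: str, final_weights_folder: str, num_shards: int) -> str:
    """Illustrative reconstruction of the vllm_server launch command assembled above."""
    max_num_batched_tokens = 2560  # vLLM's default
    max_model_len = None
    if "llama-2" in model_name:
        max_num_batched_tokens = 4096  # needs to exceed the model's context window
    if "mistral" in model_name:
        max_num_batched_tokens = 8000
        max_model_len = 8000

    command = (
        f"python -m vllm_server --model {final_weights_folder} "
        f"--tensor-parallel-size {num_shards} --port 5005 "
        f"--max-num-batched-tokens {max_num_batched_tokens}"
    )
    if max_model_len:
        command += f" --max-model-len {max_model_len}"
    return command


print(build_vllm_command("mistral-7b-instruct", "mistral_files", 1))
# python -m vllm_server --model mistral_files --tensor-parallel-size 1 --port 5005
#   --max-num-batched-tokens 8000 --max-model-len 8000
```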
Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 ray==2.6.3
-git+https://github.com/vllm-project/vllm.git@7d7e3b78a3c265ab3c57eeff43af56f509907998#egg=vllm
+vllm==0.2.0
 pydantic==1.10.12
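Since the image now installs a released wheel instead of a pinned git commit, one quick sanity check is to read the installed versions back at runtime (purely illustrative):

```python
from importlib.metadata import version

# The updated requirements pin vllm to 0.2.0 alongside ray 2.6.3.
print(version("vllm"))  # expected: 0.2.0
print(version("ray"))   # expected: 2.6.3
```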
