Skip to content

Commit 9367514

Browse files
authored
mistral 3.1 small (#420)
For real use cases we'll need to raise tensor parallelism to TP=2 to enable higher sequence lengths and more images per request; for now — in line with Gemma 27B — we're shipping with TP=1 on a single H100.
1 parent b69f4ad commit 9367514

File tree

1 file changed

+57
-0
lines changed

1 file changed

+57
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# Truss deployment config: Mistral Small 3.1 (24B Instruct 2503) served with vLLM.
#
# Reference full-scale invocation (10 images per prompt, tensor-parallel 2):
#   vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral --enable-auto-tool-choice --limit_mm_per_prompt 'image=10' --tensor-parallel-size 2
base_image:
  # vLLM post-merge CI image, pinned to an exact commit SHA for reproducibility.
  image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:c0efdd655b4ce9188f93b0030dcdebcf43858914
build_commands:
  # Pinned transformers commit — presumably required for Mistral Small 3.1
  # support ahead of a tagged release; TODO confirm before unpinning.
  - pip install git+https://github.com/huggingface/transformers@cbfb8d7b27b4724f60c4085842f5150dbd3b41f3
model_metadata:
  repo_id: mistralai/Mistral-Small-3.1-24B-Instruct-2503
  # Sample multimodal chat-completion request (block style; parses to the same
  # data as the original flow/JSON mapping).
  example_model_input:
    model: mistral
    messages:
      - role: user
        content:
          - type: text
            text: Describe this image in one sentence.
          - type: image_url
            image_url:
              url: "https://picsum.photos/id/237/200/300"
    stream: true
    max_tokens: 512
    temperature: 0.5
  tags:
    - openai-compatible
docker_server:
  # NOTE(review): deployed at TP=1 with 1 image per prompt; raise to TP=2 and a
  # higher --limit_mm_per_prompt (see header comment) for longer sequences and
  # more images per request.
  start_command: "sh -c \"VLLM_USE_V1=1 HF_TOKEN=$(cat /secrets/hf_access_token) vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral --enable-auto-tool-choice --served-model-name mistral --max-num-seqs 8 --max-model-len 16384 --limit_mm_per_prompt 'image=1' --tensor-parallel-size 1 --gpu-memory-utilization 0.95\""
  readiness_endpoint: /health
  liveness_endpoint: /health
  predict_endpoint: /v1/chat/completions
  server_port: 8000
environment_variables:
  VLLM_LOGGING_LEVEL: INFO
  hf_access_token: null
requirements:
  - huggingface_hub
  - hf_transfer
  - datasets
resources:
  accelerator: H100:1
  use_gpu: true
secrets:
  # Injected at deploy time (read by start_command from /secrets/); never
  # commit a real token here.
  hf_access_token: null
runtime:
  health_checks:
    restart_check_delay_seconds: 300  # Waits 5 minutes after deployment before starting health checks
    restart_threshold_seconds: 300  # Triggers a restart if health checks fail for 5 minutes
    stop_traffic_threshold_seconds: 120  # Stops traffic if health checks fail for 2 minutes
  predict_concurrency: 8
model_name: Mistral Small 3.1

0 commit comments

Comments
 (0)