-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvLLM.py
More file actions
34 lines (26 loc) · 1.08 KB
/
vLLM.py
File metadata and controls
34 lines (26 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os
from pydantic import BaseModel, Field
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
class ResponseFormat(BaseModel):
    """Schema that every generated completion must satisfy.

    Passed (as a JSON schema) to vLLM's guided decoding so the model is
    constrained to emit a JSON object with exactly these fields.
    """

    # PEP 8: no space before the annotation colon (original had `name : str`).
    name: str  # person's name extracted from the prompt text
    age: int   # person's age extracted from the prompt text
num_gpus = len(os.environ.get("CUDA_VISIBLE_DEVICES", "").split(","))
# Three extraction prompts, batched through a single generate() call below.
inputs = [
    "give me name and age of the person as json. \"I\'m vishva, I'm 26 years old '\"",
    "give me name and age of the person as json. \"I\'m Vimal, I'm 45 years old '\"",
    "give me name and age of the person as json. \"I\'m Ragul, I'm 31 years old '\"",
]

# Load the model once, sharded across all visible GPUs via tensor parallelism.
# hf_overrides pins the architecture name expected by vLLM's model registry.
llm = LLM(
    model="Qwen/Qwen2.5-3B-Instruct",
    trust_remote_code=True,
    tensor_parallel_size=num_gpus,
    hf_overrides={"architectures": ["Qwen2ForCausalLM"]},
)

# Constrain decoding so every completion is JSON matching ResponseFormat.
json_schema = ResponseFormat.model_json_schema()
guided_decoding_params = GuidedDecodingParams(json=json_schema)

# Low temperature for near-deterministic extraction; 512 tokens is ample
# for a two-field JSON object.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.9,
    max_tokens=512,
    guided_decoding=guided_decoding_params,
)

outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)