from vllm import LLM, SamplingParams

-os.environ["VLLM_SKIP_WARMUP"] = "true"
-prompts = [
-    "Hello, my name is",
-    "0.999 compares to 0.9 is ",
-    "The capital of France is",
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0, max_tokens=50)
-model = "/mnt/weka/llm/Qwen3/Qwen3-30B-A3B/"
-# model = "/mnt/weka/llm/Qwen3/Qwen3-32B/"
-# model = "meta-llama/Llama-3.2-1B-Instruct"
-# model = "/mnt/weka/llm/DeepSeek-V2-Lite-Chat/"
-# model = "/mnt/weka/data/mlperf_models/Mixtral-8x7B-Instruct-v0.1"
-# model = "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B/"
-kwargs = {"tensor_parallel_size": 1}
-if os.path.basename(model) in ["Qwen3-30B-A3B", "DeepSeek-V2-Lite-Chat"]:
-    kwargs["enable_expert_parallel"] = True
-llm = LLM(model=model, max_model_len=4096, trust_remote_code=True, **kwargs)
+def main():
+    os.environ["VLLM_SKIP_WARMUP"] = "true"
+    prompts = [
+        "Hello, my name is",
+        "0.999 compares to 0.9 is ",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0, max_tokens=50)
+    model = "/mnt/weka/llm/Qwen3/Qwen3-30B-A3B/"
+    # model = "/mnt/weka/llm/Qwen3/Qwen3-32B/"
+    # model = "meta-llama/Llama-3.2-1B-Instruct"
+    # model = "/mnt/weka/llm/DeepSeek-V2-Lite-Chat/"
+    # model = "/mnt/weka/data/mlperf_models/Mixtral-8x7B-Instruct-v0.1"
+    # model = "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B/"
+    kwargs = {"tensor_parallel_size": 2}
+    if os.path.basename(model) in ["Qwen3-30B-A3B", "DeepSeek-V2-Lite-Chat"]:
+        kwargs["enable_expert_parallel"] = True
+    llm = LLM(model=model, max_model_len=4096, trust_remote_code=True, **kwargs)

-outputs = llm.generate(prompts, sampling_params)
+    outputs = llm.generate(prompts, sampling_params)

-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+if __name__ == "__main__":
+    main()
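Why the commit pairs the main() wrapper with tensor_parallel_size = 2 is my reading, not something stated in the diff: with more than one tensor-parallel rank, vLLM's multiprocessing executor launches worker processes that re-import the script, and under the "spawn" start method any engine construction left at module level would run again in every worker. Moving setup into main() behind the __main__ guard confines it to the parent process. A minimal sketch of the contrast, reusing the small meta-llama/Llama-3.2-1B-Instruct checkpoint already listed among the commented-out models:

from vllm import LLM, SamplingParams

# Unsafe pattern the commit removes: engine built at import time.
# With tensor_parallel_size > 1 under the "spawn" start method, each
# vLLM worker re-imports this module and would build the engine again.
# llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", tensor_parallel_size=2)

# Pattern the commit adopts: construction runs only when the file is
# executed directly, so spawned workers that merely import it skip it.
def main():
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", tensor_parallel_size=2)
    outputs = llm.generate(["Hello"], SamplingParams(temperature=0, max_tokens=8))
    print(outputs[0].outputs[0].text)

if __name__ == "__main__":
    main()

The enable_expert_parallel toggle fits the same two-rank setup: for the MoE checkpoints in the list (Qwen3-30B-A3B, DeepSeek-V2-Lite-Chat), it distributes whole experts across the ranks instead of tensor-slicing each expert's weights.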