recipes/benchmarks/inference_throughput/on-prem/vllm — 3 files changed: +4 −9 lines

This change removes the separate TOKENIZER_PATH parameter from the on-prem vLLM throughput benchmarks. The tokenizer is now downloaded from HuggingFace via the existing MODEL_PATH, so each model (e.g. Llama 2 vs. Llama 3, which ship different tokenizers) automatically gets its matching tokenizer.

File 1 of 3 (fixed-prompt benchmark script):
```diff
@@ -40,8 +40,6 @@
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Default Llama tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Add your model endpoints here, specify the port number. You can acquire the endpoint when creating an on-prem server like vLLM.
@@ -55,8 +53,8 @@
 print("No available GPUs")
 
 
-# This tokenizer is downloaded from the Azure model catalog for each specific model. The main purpose is to decode the responses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of tokens for input prompt: {num_token_input_prompt}")
```
File 2 of 3 (benchmark parameters JSON):

```diff
@@ -5,7 +5,6 @@
     "MODEL_HEADERS": {"Content-Type": "application/json"},
     "SAFE_CHECK": true,
     "THRESHOLD_TPS": 7,
-    "TOKENIZER_PATH": "../../tokenizer",
     "RANDOM_PROMPT_LENGTH": 1000,
     "TEMPERATURE": 0.6,
     "TOP_P": 0.9,
```
File 3 of 3 (random-prompt benchmark script):

```diff
@@ -36,8 +36,6 @@
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -52,8 +50,8 @@
 print("No available GPUs")
 
 
-# This tokenizer is downloaded from the Azure model catalog for each specific model. The main purpose is to decode the responses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
```
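The diff stops at the vocabulary filter, so the prompt-construction step below is an assumed continuation, not code from this change: a plausible way to turn the filtered vocabulary into a random prompt of roughly RANDOM_PROMPT_LENGTH tokens.

```python
import random

import transformers

MODEL_PATH = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder model path
RANDOM_PROMPT_LENGTH = 1000                         # matches the params file above

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

# Same filter as the diff: keep tokens longer than 2 characters and pure ASCII,
# which biases the sample toward real English words.
vocab = [token for token in tokenizer.get_vocab().keys()
         if len(token) > 2 and all(ord(c) < 128 for c in token)]

# Assumed continuation (not shown in the diff): sample uniformly with
# replacement to build a prompt of roughly the requested token length.
random_prompt = " ".join(random.choices(vocab, k=RANDOM_PROMPT_LENGTH))
```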