
Commit c96ac50

Remove local tokenizer requirement for vllm on prem throughput benchmark (meta-llama#514)
2 parents 77fc13f + 816b25f

File tree

3 files changed: +4 additions, -9 deletions


recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py

Lines changed: 2 additions & 4 deletions
@@ -40,8 +40,6 @@
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Default Llama tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Add your model endpoints here, specify the port number. You can acquire the endpoint when creating a on-prem server like vLLM.
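As context for the THRESHOLD_TPS setting kept above: it is the tokens-per-second floor below which a query is reported as slow. A minimal sketch of that check, with hypothetical helper and argument names (the benchmark's own bookkeeping may differ):

def is_slow_request(num_output_tokens: int, latency_seconds: float, threshold_tps: float) -> bool:
    # Tokens generated per second of wall-clock latency; anything below the threshold counts as slow.
    return (num_output_tokens / latency_seconds) < threshold_tps

is_slow_request(256, 40.0, threshold_tps=7)  # True: 6.4 tokens/s is below the 7 tokens/s threshold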
@@ -55,8 +53,8 @@
 print("No available GPUs")


-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")

recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 "MODEL_HEADERS" : {"Content-Type": "application/json"},
 "SAFE_CHECK" : true,
 "THRESHOLD_TPS" : 7,
-"TOKENIZER_PATH" : "../../tokenizer",
 "RANDOM_PROMPT_LENGTH" : 1000,
 "TEMPERATURE" : 0.6,
 "TOP_P" : 0.9,

recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py

Lines changed: 2 additions & 4 deletions
@@ -36,8 +36,6 @@
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -52,8 +50,8 @@
 print("No available GPUs")


-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
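The pretrained benchmark uses that filtered vocabulary to synthesize prompts of a target token count rather than sending a fixed chat prompt. A minimal sketch of the idea, with a hypothetical helper name (the benchmark's own prompt construction may differ):

import random

def build_random_prompt(tokenizer, num_words: int) -> str:
    # Same filter as above: vocabulary entries longer than 2 characters, ASCII only.
    vocab = [t for t in tokenizer.get_vocab().keys() if len(t) > 2 and all(ord(c) < 128 for c in t)]
    return " ".join(random.choices(vocab, k=num_words))

prompt = build_random_prompt(tokenizer, num_words=1000)  # e.g. RANDOM_PROMPT_LENGTH from parameters.json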
